
    hz                       S r SSKJr  SSKrSSKrSSKJr  SSKJs  Jr	  SSK
JrJr  SSKJr  SSKJr  SSKJrJrJr  S	r " S
 S\R,                  5      r " S S\5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r " S S\R,                  5      r g)zTransformer modules.    )annotationsN)	constant_xavier_uniform_)
TORCH_1_11   )Conv)_get_clonesinverse_sigmoid#multi_scale_deformable_attn_pytorch)
TransformerEncoderLayerTransformerLayerTransformerBlockMLPBlockLayerNorm2dAIFIDeformableTransformerDecoder!DeformableTransformerDecoderLayerMSDeformAttnMLPc                     ^  \ rS rSrSrSSS\R                  " 5       S4           SU 4S jjjr\SSS jj5       r	   S         SS	 jjr
   S         SS
 jjr   S         SS jjrSrU =r$ )r       a  
A single layer of the transformer encoder.

This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
supporting both pre-normalization and post-normalization configurations.

Attributes:
    ma (nn.MultiheadAttention): Multi-head attention module.
    fc1 (nn.Linear): First linear layer in the feedforward network.
    fc2 (nn.Linear): Second linear layer in the feedforward network.
    norm1 (nn.LayerNorm): Layer normalization after attention.
    norm2 (nn.LayerNorm): Layer normalization after feedforward network.
    dropout (nn.Dropout): Dropout layer for the feedforward network.
    dropout1 (nn.Dropout): Dropout layer after attention.
    dropout2 (nn.Dropout): Dropout layer after feedforward network.
    act (nn.Module): Activation function.
    normalize_before (bool): Whether to apply normalization before attention and feedforward.
              Fc                  > [         TU ]  5         SSKJn  U(       d  [	        S5      e[
        R                  " XUSS9U l        [
        R                  " X5      U l	        [
        R                  " X!5      U l
        [
        R                  " U5      U l        [
        R                  " U5      U l        [
        R                  " U5      U l        [
        R                  " U5      U l        [
        R                  " U5      U l        XPl        X`l        g)a  
Initialize the TransformerEncoderLayer with specified parameters.

Args:
    c1 (int): Input dimension.
    cm (int): Hidden dimension in the feedforward network.
    num_heads (int): Number of attention heads.
    dropout (float): Dropout probability.
    act (nn.Module): Activation function.
    normalize_before (bool): Whether to apply normalization before attention and feedforward.
   )	TORCH_1_9z]TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).T)dropoutbatch_firstN)super__init__utils.torch_utilsr   ModuleNotFoundErrornnMultiheadAttentionmaLinearfc1fc2	LayerNormnorm1norm2Dropoutr   dropout1dropout2actnormalize_before)	selfc1cm	num_headsr   r0   r1   r   	__class__s	           \/home/james-whalen/.local/lib/python3.13/site-packages/ultralytics/nn/modules/transformer.pyr!    TransformerEncoderLayer.__init__4   s    ( 	2%o  ''wTXY99R$99R$\\"%
\\"%
zz'*

7+

7+ 0    c                    Uc  U $ X-   $ )z2Add position embeddings to the tensor if provided. tensorposs     r7   with_pos_embed&TransformerEncoderLayer.with_pos_embed]        v6&,6r9   c           	     Z   U R                  X5      =pVU R                  XVXUS9S   nXR                  U5      -   nU R                  U5      nU R	                  U R                  U R                  U R                  U5      5      5      5      nXR                  U5      -   nU R                  U5      $ )av  
Perform forward pass with post-normalization.

Args:
    src (torch.Tensor): Input tensor.
    src_mask (torch.Tensor, optional): Mask for the src sequence.
    src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
    pos (torch.Tensor, optional): Positional encoding.

Returns:
    (torch.Tensor): Output tensor after attention and feedforward.
value	attn_maskkey_padding_maskr   )
r?   r&   r.   r+   r)   r   r0   r(   r/   r,   )r2   srcsrc_masksrc_key_padding_maskr>   qksrc2s           r7   forward_post$TransformerEncoderLayer.forward_postb   s    & ##C--wwq3MawbcdeMM$''jjoxxTXXdhhsm%<=>MM$''zz#r9   c           	     Z   U R                  U5      nU R                  XT5      =pgU R                  XgXRUS9S   nXR                  U5      -   nU R	                  U5      nU R                  U R                  U R                  U R                  U5      5      5      5      nXR                  U5      -   $ )au  
Perform forward pass with pre-normalization.

Args:
    src (torch.Tensor): Input tensor.
    src_mask (torch.Tensor, optional): Mask for the src sequence.
    src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
    pos (torch.Tensor, optional): Positional encoding.

Returns:
    (torch.Tensor): Output tensor after attention and feedforward.
rC   r   )
r+   r?   r&   r.   r,   r)   r   r0   r(   r/   )r2   rG   rH   rI   r>   rL   rJ   rK   s           r7   forward_pre#TransformerEncoderLayer.forward_pre}   s    & zz###D..wwq4NbwcdefMM$''zz#xxTXXdhhtn%=>?]]4(((r9   c                l    U R                   (       a  U R                  XX45      $ U R                  XX45      $ )a  
Forward propagate the input through the encoder module.

Args:
    src (torch.Tensor): Input tensor.
    src_mask (torch.Tensor, optional): Mask for the src sequence.
    src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
    pos (torch.Tensor, optional): Positional encoding.

Returns:
    (torch.Tensor): Output tensor after transformer encoder layer.
)r1   rP   rM   )r2   rG   rH   rI   r>   s        r7   forwardTransformerEncoderLayer.forward   s5    &   ##C3GMM  0DJJr9   )
r0   r   r.   r/   r(   r)   r&   r+   r,   r1   r3   intr4   rV   r5   rV   r   floatr0   	nn.Moduler1   boolNr=   torch.Tensorr>   torch.Tensor | Nonereturnr\   NNN)
rG   r\   rH   r]   rI   r]   r>   r]   r^   r\   )__name__
__module____qualname____firstlineno____doc__r$   GELUr!   staticmethodr?   rM   rP   rS   __static_attributes____classcell__r6   s   @r7   r   r       sD   , !&'1'1 '1 	'1
 '1 '1 '1 '1R 7 7 )-48#' & 2	
 ! 
< )-48#')) &) 2	)
 !) 
)< )-48#'KK &K 2	K
 !K 
K Kr9   r   c                     ^  \ rS rSrSrSSS\R                  " 5       S4           SU 4S jjjrSU 4S jjr\	 S         SS	 jj5       r
S
rU =r$ )r      z
AIFI transformer layer for 2D data with positional embeddings.

This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
embeddings and handling the spatial dimensions appropriately.
r   r   r   Fc                (   > [         TU ]  XX4XV5        g)a|  
Initialize the AIFI instance with specified parameters.

Args:
    c1 (int): Input dimension.
    cm (int): Hidden dimension in the feedforward network.
    num_heads (int): Number of attention heads.
    dropout (float): Dropout probability.
    act (nn.Module): Activation function.
    normalize_before (bool): Whether to apply normalization before attention and feedforward.
N)r    r!   )r2   r3   r4   r5   r   r0   r1   r6   s          r7   r!   AIFI.__init__   s    ( 	SKr9   c                R  > UR                   SS u  p#nU R                  XCU5      n[        TU ]  UR	                  S5      R                  SSS5      UR                  UR                  UR                  S9S9nUR                  SSS5      R                  SX#U/5      R                  5       $ )z
Forward pass for the AIFI transformer layer.

Args:
    x (torch.Tensor): Input tensor with shape [B, C, H, W].

Returns:
    (torch.Tensor): Output tensor with shape [B, C, H, W].
r   N   r   )devicedtype)r>   )shape"build_2d_sincos_position_embeddingr    rS   flattenpermutetorp   rq   view
contiguous)r2   xchw	pos_embedr6   s         r7   rS   AIFI.forward   s     ''!"+a;;A!D	GOAIIaL00Aq9y||STS[S[cdcjcj|?kOlyyAq!&&A!}5@@BBr9   c                   US-  S:X  d   S5       e[         R                  " U [         R                  S9n[         R                  " U[         R                  S9n[        (       a  [         R                  " XESS9O[         R                  " XE5      u  pEUS-  n[         R                  " U[         R                  S9U-  nSX7-  -  nUR                  5       S   US	   -  nUR                  5       S   US	   -  n	[         R                  " [         R                  " U5      [         R                  " U5      [         R                  " U	5      [         R                  " U	5      /S
5      S	   $ )a?  
Build 2D sine-cosine position embedding.

Args:
    w (int): Width of the feature map.
    h (int): Height of the feature map.
    embed_dim (int): Embedding dimension.
    temperature (float): Temperature for the sine/cosine functions.

Returns:
    (torch.Tensor): Position embedding with shape [1, embed_dim, h*w].
   r   zHEmbed dimension must be divisible by 4 for 2D sin-cos position embeddingrq   ij)indexingg      ?.NNr   )	torcharangefloat32r   meshgridru   catsincos)
r}   r|   	embed_dimtemperaturegrid_wgrid_hpos_dimomegaout_wout_hs
             r7   rt   'AIFI.build_2d_sincos_position_embedding   s     1}!m#mm!au}}5au}}5JT*FZ_ZhZhioZxq.WEMM:WD{)* +eDk9 +eDk9yy%))E*EIIe,<eii>NPUPYPYZ_P`acdefjkkr9   r;   rU   rz   r\   r^   r\   )   g     @)
r}   rV   r|   rV   r   rV   r   rW   r^   r\   )r`   ra   rb   rc   rd   r$   re   r!   rS   rf   rt   rg   rh   ri   s   @r7   r   r      s     !&LL L 	L
 L L L L,C  CJlll#&l;@l	l lr9   r   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )r      zeTransformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance).c                X  > [         TU ]  5         [        R                  " XSS9U l        [        R                  " XSS9U l        [        R                  " XSS9U l        [        R                  " XS9U l        [        R                  " XSS9U l	        [        R                  " XSS9U l
        g)z
Initialize a self-attention mechanism using linear transformations and multi-head attention.

Args:
    c (int): Input and output channel dimension.
    num_heads (int): Number of attention heads.
F)bias)r   r5   N)r    r!   r$   r'   rJ   rK   vr%   r&   r(   r)   )r2   r{   r5   r6   s      r7   r!   TransformerLayer.__init__   s|     	1e,1e,1e,''!I99Q.99Q.r9   c                    U R                  U R                  U5      U R                  U5      U R                  U5      5      S   U-   nU R	                  U R                  U5      5      U-   $ )z
Apply a transformer block to the input x and return the output.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after transformer layer.
r   )r&   rJ   rK   r   r)   r(   r2   rz   s     r7   rS   TransformerLayer.forward  sT     GGDFF1Itvvay$&&)4Q7!;xx$q((r9   )r(   r)   rK   r&   rJ   r   )r{   rV   r5   rV   r   	r`   ra   rb   rc   rd   r!   rS   rg   rh   ri   s   @r7   r   r      s    o/ ) )r9   r   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )r   i  a  
Vision Transformer block based on https://arxiv.org/abs/2010.11929.

This class implements a complete transformer block with optional convolution layer for channel adjustment,
learnable position embedding, and multiple transformer layers.

Attributes:
    conv (Conv, optional): Convolution layer if input and output channels differ.
    linear (nn.Linear): Learnable position embedding.
    tr (nn.Sequential): Sequential container of transformer layers.
    c2 (int): Output channel dimension.
c                   >^^ [         TU ]  5         SU l        UT:w  a  [        UT5      U l        [        R
                  " TT5      U l        [        R                  " UU4S j[        U5       5       6 U l	        TU l
        g)a  
Initialize a Transformer module with position embedding and specified number of heads and layers.

Args:
    c1 (int): Input channel dimension.
    c2 (int): Output channel dimension.
    num_heads (int): Number of attention heads.
    num_layers (int): Number of transformer layers.
Nc              3  <   >#    U  H  n[        TT5      v   M     g 7frZ   )r   ).0_c2r5   s     r7   	<genexpr>,TransformerBlock.__init__.<locals>.<genexpr>:  s     !]K\a"22y"A"AK\s   )r    r!   convr   r$   r'   linear
Sequentialrangetrr   )r2   r3   r   r5   
num_layersr6   s     `` r7   r!   TransformerBlock.__init__+  s`     		8RDIiiB'--!]5Q[K\!]^r9   c                :   U R                   b  U R                  U5      nUR                  u  p#pEUR                  S5      R                  SSS5      nU R	                  X`R                  U5      -   5      R                  SSS5      R                  X R                  XE5      $ )z
Forward propagate the input through the transformer block.

Args:
    x (torch.Tensor): Input tensor with shape [b, c1, w, h].

Returns:
    (torch.Tensor): Output tensor with shape [b, c2, w, h].
ro   r   r   )r   rs   ru   rv   r   r   reshaper   )r2   rz   br   r}   r|   ps          r7   rS   TransformerBlock.forward=  s     99 		!AWW
aIIaL  Aq)wwq;;q>)*221a;CCAwwPQUUr9   )r   r   r   r   )r3   rV   r   rV   r5   rV   r   rV   r   r   ri   s   @r7   r   r     s    $V Vr9   r   c                  T   ^  \ rS rSrSr\R                  4SU 4S jjjrSS jrSr	U =r
$ )r   iN  z+A single block of a multi-layer perceptron.c                   > [         TU ]  5         [        R                  " X5      U l        [        R                  " X!5      U l        U" 5       U l        g)z
Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.

Args:
    embedding_dim (int): Input and output dimension.
    mlp_dim (int): Hidden dimension.
    act (nn.Module): Activation function.
N)r    r!   r$   r'   lin1lin2r0   )r2   embedding_dimmlp_dimr0   r6   s       r7   r!   MLPBlock.__init__Q  s9     	IIm5	IIg5	5r9   c                `    U R                  U R                  U R                  U5      5      5      $ )z
Forward pass for the MLPBlock.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after MLP block.
)r   r0   r   r   s     r7   rS   MLPBlock.forward_  s$     yy$))A,/00r9   )r0   r   r   )r   rV   r   rV   r   )r`   ra   rb   rc   rd   r$   re   r!   rS   rg   rh   ri   s   @r7   r   r   N  s!    5=?WW  
1 
1r9   r   c                  h   ^  \ rS rSrSr\R                  S4         SU 4S jjjrSS jrSr	U =r
$ )	r   il  a  
A simple multi-layer perceptron (also called FFN).

This class implements a configurable MLP with multiple linear layers, activation functions, and optional
sigmoid output activation.

Attributes:
    num_layers (int): Number of layers in the MLP.
    layers (nn.ModuleList): List of linear layers.
    sigmoid (bool): Whether to apply sigmoid to the output.
    act (nn.Module): Activation function.
Fc                   > [         TU ]  5         X@l        U/US-
  -  n[        R                  " S [        U/U-   Xs/-   5       5       5      U l        X`l        U" 5       U l        g)ae  
Initialize the MLP with specified input, hidden, output dimensions and number of layers.

Args:
    input_dim (int): Input dimension.
    hidden_dim (int): Hidden dimension.
    output_dim (int): Output dimension.
    num_layers (int): Number of layers.
    act (nn.Module): Activation function.
    sigmoid (bool): Whether to apply sigmoid to the output.
r   c              3  R   #    U  H  u  p[         R                  " X5      v   M     g 7frZ   )r$   r'   )r   nrK   s      r7   r   MLP.__init__.<locals>.<genexpr>  s     #g@fBIIaOO@fs   %'N)	r    r!   r   r$   
ModuleListziplayerssigmoidr0   )	r2   	input_dim
hidden_dim
output_dimr   r0   r   r|   r6   s	           r7   r!   MLP.__init__z  s^     	$LJN+mm#gYKRSOUVYeUe@f#gg5r9   c                   [        U R                  5       HK  u  p#X R                  S-
  :  a,  [        U S[        R
                  " 5       5      " U" U5      5      OU" U5      nMM     [        U SS5      (       a  UR                  5       $ U$ )z
Forward pass for the entire MLP.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after MLP.
r   r0   r   F)	enumerater   r   getattrr$   ReLUr   )r2   rz   ilayers       r7   rS   MLP.forward  sm     "$++.HA=>STAT=TeRWWY/a9Z_`aZbA /%dIu==qyy{D1Dr9   )r0   r   r   r   )
r   rV   r   rV   r   rV   r   rV   r   rY   r   )r`   ra   rb   rc   rd   r$   r   r!   rS   rg   rh   ri   s   @r7   r   r   l  sV     VXU\U\ns*-;>LOgk *E Er9   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r   i  aI  
2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.

This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
while preserving spatial dimensions.

Attributes:
    weight (nn.Parameter): Learnable scale parameter.
    bias (nn.Parameter): Learnable bias parameter.
    eps (float): Small constant for numerical stability.

References:
    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
c                   > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        [        R                  " [        R                  " U5      5      U l        X l	        g)z
Initialize LayerNorm2d with the given parameters.

Args:
    num_channels (int): Number of channels in the input.
    eps (float): Small constant for numerical stability.
N)
r    r!   r$   	Parameterr   onesweightzerosr   eps)r2   num_channelsr   r6   s      r7   r!   LayerNorm2d.__init__  sG     	ll5::l#;<LL\!:;	r9   c                
   UR                  SSS9nX-
  R                  S5      R                  SSS9nX-
  [        R                  " X0R                  -   5      -  nU R
                  SS2SS4   U-  U R                  SS2SS4   -   $ )z
Perform forward pass for 2D layer normalization.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Normalized output tensor.
r   Tkeepdimro   N)meanpowr   sqrtr   r   r   )r2   rz   uss       r7   rS   LayerNorm2d.forward  s~     FF1dF#UKKN40UejjXX..{{1dD=)A-		!T4-0HHHr9   )r   r   r   )gư>)r   rV   r   rW   r   r   ri   s   @r7   r   r     s      I Ir9   r   c                  `   ^  \ rS rSrSrSSU 4S jjjrS r S	           S
S jjrSrU =r	$ )r   i  a  
Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.

This module implements multiscale deformable attention that can attend to features at multiple scales
with learnable sampling locations and attention weights.

Attributes:
    im2col_step (int): Step size for im2col operations.
    d_model (int): Model dimension.
    n_levels (int): Number of feature levels.
    n_heads (int): Number of attention heads.
    n_points (int): Number of sampling points per attention head per feature level.
    sampling_offsets (nn.Linear): Linear layer for generating sampling offsets.
    attention_weights (nn.Linear): Linear layer for generating attention weights.
    value_proj (nn.Linear): Linear layer for projecting values.
    output_proj (nn.Linear): Linear layer for projecting output.

References:
    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
c                  > [         TU ]  5         X-  S:w  a  [        SU SU 35      eX-  nXS-  U:X  d   S5       eSU l        Xl        X l        X0l        X@l        [        R                  " XU-  U-  S-  5      U l
        [        R                  " XU-  U-  5      U l        [        R                  " X5      U l        [        R                  " X5      U l        U R                  5         g)a  
Initialize MSDeformAttn with the given parameters.

Args:
    d_model (int): Model dimension.
    n_levels (int): Number of feature levels.
    n_heads (int): Number of attention heads.
    n_points (int): Number of sampling points per attention head per feature level.
r   z.d_model must be divisible by n_heads, but got z and z(`d_model` must be divisible by `n_heads`@   ro   N)r    r!   
ValueErrorim2col_stepd_modeln_levelsn_headsn_pointsr$   r'   sampling_offsetsattention_weights
value_projoutput_proj_reset_parameters)r2   r   r   r   r   _d_per_headr6   s         r7   r!   MSDeformAttn.__init__  s     	!MgYV[\c[deff($/[1[[/   "		'X3E3PST3T U!#7h4F4Q!R))G599W6 r9   c                P   [        U R                  R                  R                  S5        [        R
                  " U R                  [        R                  S9S[        R                  -  U R                  -  -  n[        R                  " UR                  5       UR                  5       /S5      nX"R                  5       R                  SSS9S   -  R                  U R                  SSS	5      R!                  SU R"                  U R$                  S5      n['        U R$                  5       H  nUS
S
2S
S
2US
S
24==   US-   -  ss'   M     [        R(                  " 5          [*        R,                  " UR                  S5      5      U R                  l        S
S
S
5        [        U R0                  R                  R                  S5        [        U R0                  R.                  R                  S5        [3        U R4                  R                  R                  5        [        U R4                  R.                  R                  S5        [3        U R6                  R                  R                  5        [        U R6                  R.                  R                  S5        g
! , (       d  f       GN
= f)zReset module parameters.r   r   g       @rr   Tr   r   r   ro   N)r   r   r   datar   r   r   r   mathpistackr   r   absmaxrx   repeatr   r   r   no_gradr$   r   r   r   r   r   r   )r2   thetas	grid_initr   s       r7   r   MSDeformAttn._reset_parameters  s   $''..33S9dll%--@C$''MTXT`T`D`aKKvzz| <bA	,,R,>qAAT$,,1a(VAt}}dmmQ7 	
 t}}%AaAqj!QU*! &]]_)+innR6H)ID!!& $((//44c:$((--22C8..334$//&&++S1((//445$""'',,c2 _s   5J
J%c           	        UR                   SS u  pgUR                   S   n[        S U 5       5      U:X  d   eU R                  U5      nUb  UR                  US   [	        S5      5      nUR                  XhU R                  U R                  U R                  -  5      nU R                  U5      R                  XgU R                  U R                  U R                  S5      n	U R                  U5      R                  XgU R                  U R                  U R                  -  5      n
[        R                  " U
S5      R                  XgU R                  U R                  U R                  5      n
UR                   S   nUS:X  a`  [        R                  " XAR                   UR"                  S9R%                  S5      nXSSSSS2SSS24   -  nUSS2SS2SSS2SSS24   U-   nOQUS	:X  a<  XR                  -  USS2SS2SSS2SSS24   -  S
-  nUSS2SS2SSS2SSS24   U-   nO['        SU S35      e[)        X4X5      nU R+                  U5      $ )ad  
Perform forward pass for multiscale deformable attention.

Args:
    query (torch.Tensor): Query tensor with shape [bs, query_length, C].
    refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
        range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
    value (torch.Tensor): Value tensor with shape [bs, value_length, C].
    value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
    value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
        elements, False for padding elements.

Returns:
    (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].

References:
    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
Nro   r   c              3  6   #    U  H  oS    US   -  v   M     g7f)r   r   Nr;   )r   r   s     r7   r   'MSDeformAttn.forward.<locals>.<genexpr>2  s     51Q4!A$;s   r   r   rr   )rq   rp   r   g      ?z5Last dim of reference_points must be 2 or 4, but got .)rs   sumr   masked_fillrW   rx   r   r   r   r   r   r   Fsoftmaxr   	as_tensorrq   rp   flipr   r   r   )r2   query
refer_bboxrD   value_shapes
value_maskbslen_qlen_vr   r   
num_pointsoffset_normalizeraddsampling_locationsoutputs                   r7   rS   MSDeformAttn.forward  s2   4 KKO	A555>>>&!%%j&;U1XFE

2dllDLLDLL4PQ007<<RVZVcVceierertuv 2259>>r$,,X\XeXehlhuhuXuvII&7<AA"T\\[_[h[hjnjwjwx%%b)
? %KKX]XdXd e j jkm n"tT4DRS7S%TTC!+Aq$4,B!Cc!I1_"]]2Z1dAtUVUW@W5XX[^^C!+Aq$4!,C!Ds!JTU_T``abcc4UJ\p''r9   )	r   r   r   r   r   r   r   r   r   )r   r   r   r   )r   rV   r   rV   r   rV   r   rV   rZ   )r  r\   r  r\   rD   r\   r  listr  r]   r^   r\   )
r`   ra   rb   rc   rd   r!   r   rS   rg   rh   ri   s   @r7   r   r     s`    *! !>36 +/1(1( !1( 	1(
 1( (1( 
1( 1(r9   r   c                     ^  \ rS rSrSrSSSS\R                  " 5       SS4             SU 4S jjjr\SS	 j5       r	SS
 jr
   S               SS jjrSrU =r$ )r   iJ  a  
Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.

This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
attention, and a feedforward network.

Attributes:
    self_attn (nn.MultiheadAttention): Self-attention module.
    dropout1 (nn.Dropout): Dropout after self-attention.
    norm1 (nn.LayerNorm): Layer normalization after self-attention.
    cross_attn (MSDeformAttn): Cross-attention module.
    dropout2 (nn.Dropout): Dropout after cross-attention.
    norm2 (nn.LayerNorm): Layer normalization after cross-attention.
    linear1 (nn.Linear): First linear layer in the feedforward network.
    act (nn.Module): Activation function.
    dropout3 (nn.Dropout): Dropout in the feedforward network.
    linear2 (nn.Linear): Second linear layer in the feedforward network.
    dropout4 (nn.Dropout): Dropout after the feedforward network.
    norm3 (nn.LayerNorm): Layer normalization after the feedforward network.

References:
    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
r   r   i   r   r   c                j  > [         TU ]  5         [        R                  " XUS9U l        [        R
                  " U5      U l        [        R                  " U5      U l        [        XX'5      U l
        [        R
                  " U5      U l        [        R                  " U5      U l        [        R                  " X5      U l        XPl        [        R
                  " U5      U l        [        R                  " X15      U l        [        R
                  " U5      U l        [        R                  " U5      U l        g)a  
Initialize the DeformableTransformerDecoderLayer with the given parameters.

Args:
    d_model (int): Model dimension.
    n_heads (int): Number of attention heads.
    d_ffn (int): Dimension of the feedforward network.
    dropout (float): Dropout probability.
    act (nn.Module): Activation function.
    n_levels (int): Number of feature levels.
    n_points (int): Number of sampling points.
)r   N)r    r!   r$   r%   	self_attnr-   r.   r*   r+   r   
cross_attnr/   r,   r'   linear1r0   dropout3linear2dropout4norm3)	r2   r   r   d_ffnr   r0   r   r   r6   s	           r7   r!   *DeformableTransformerDecoderLayer.__init__d  s    , 	 ..wQ

7+\\'*
 'w'L

7+\\'*
 yy0

7+yy0

7+\\'*
r9   c                    Uc  U $ X-   $ )z;Add positional embeddings to the input tensor, if provided.r;   r<   s     r7   r?   0DeformableTransformerDecoderLayer.with_pos_embed  rA   r9   c           	         U R                  U R                  U R                  U R                  U5      5      5      5      nXR	                  U5      -   nU R                  U5      $ )z
Perform forward pass through the Feed-Forward Network part of the layer.

Args:
    tgt (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after FFN.
)r  r  r0   r  r   r!  )r2   tgttgt2s      r7   forward_ffn-DeformableTransformerDecoderLayer.forward_ffn  sL     ||DMM$((4<<3D*EFGMM$''zz#r9   c                   U R                  X5      =pU R                  UR                  SS5      U	R                  SS5      UR                  SS5      US9S   R                  SS5      n
XR                  U
5      -   nU R	                  U5      nU R                  U R                  X5      UR                  S5      X4U5      n
XR                  U
5      -   nU R                  U5      nU R                  U5      $ )a  
Perform the forward pass through the entire decoder layer.

Args:
    embed (torch.Tensor): Input embeddings.
    refer_bbox (torch.Tensor): Reference bounding boxes.
    feats (torch.Tensor): Feature maps.
    shapes (list): Feature shapes.
    padding_mask (torch.Tensor, optional): Padding mask.
    attn_mask (torch.Tensor, optional): Attention mask.
    query_pos (torch.Tensor, optional): Query position embeddings.

Returns:
    (torch.Tensor): Output tensor after decoder layer.
r   r   )rE   ro   )
r?   r  	transposer.   r+   r  	unsqueezer/   r,   r)  )r2   embedr  featsshapespadding_maskrE   	query_posrJ   rK   r'  s              r7   rS   )DeformableTransformerDecoderLayer.forward  s    4 ##E55nnQ[[A.Aq0A5??STVWCXdmnn

)Aq/ 	 c**

5! oo1:3G3G3JE[g
 c**

5! &&r9   )r0   r  r.   r/   r  r   r  r  r+   r,   r!  r  )r   rV   r   rV   r"  rV   r   rW   r0   rX   r   rV   r   rV   r[   )r'  r\   r^   r\   r_   )r.  r\   r  r\   r/  r\   r0  r  r1  r]   rE   r]   r2  r]   r^   r\   )r`   ra   rb   rc   rd   r$   r   r!   rf   r?   r)  rS   rg   rh   ri   s   @r7   r   r   J  s    6 (+(+ (+ 	(+
 (+ (+ (+ (+ (+T 7 7( -1)-)-)')' !)' 	)'
 )' *)' ')' ')' 
)' )'r9   r   c                  h   ^  \ rS rSrSrSSU 4S jjjr  S                 S	S jjrSrU =r$ )
r   i  aJ  
Deformable Transformer Decoder based on PaddleDetection implementation.

This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
heads for bounding box regression and classification.

Attributes:
    layers (nn.ModuleList): List of decoder layers.
    num_layers (int): Number of decoder layers.
    hidden_dim (int): Hidden dimension.
    eval_idx (int): Index of the layer to use during evaluation.

References:
    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
c                   > [         TU ]  5         [        X#5      U l        X0l        Xl        US:  a  X@l        gX4-   U l        g)a  
Initialize the DeformableTransformerDecoder with the given parameters.

Args:
    hidden_dim (int): Hidden dimension.
    decoder_layer (nn.Module): Decoder layer module.
    num_layers (int): Number of decoder layers.
    eval_idx (int): Index of the layer to use during evaluation.
r   N)r    r!   r	   r   r   r   eval_idx)r2   r   decoder_layerr   r6  r6   s        r7   r!   %DeformableTransformerDecoder.__init__  s:     	!-<$$$,Mz7Lr9   c
                   Un
/ n/ nSnUR                  5       n[        U R                  5       GH  u  pU" XX4XU" U5      5      n
X^   " U
5      n[        R                   " U[	        U5      -   5      nU R
                  (       ac  UR                  Xn   " U
5      5        US:X  a  UR                  U5        OmUR                  [        R                   " U[	        U5      -   5      5        O;XR                  :X  a,  UR                  Xn   " U
5      5        UR                  U5          O*UnU R
                  (       a  UR                  5       OUnGM     [        R                  " U5      [        R                  " U5      4$ )aq  
Perform the forward pass through the entire decoder.

Args:
    embed (torch.Tensor): Decoder embeddings.
    refer_bbox (torch.Tensor): Reference bounding boxes.
    feats (torch.Tensor): Image features.
    shapes (list): Feature shapes.
    bbox_head (nn.Module): Bounding box prediction head.
    score_head (nn.Module): Score prediction head.
    pos_mlp (nn.Module): Position MLP.
    attn_mask (torch.Tensor, optional): Attention mask.
    padding_mask (torch.Tensor, optional): Padding mask.

Returns:
    dec_bboxes (torch.Tensor): Decoded bounding boxes.
    dec_cls (torch.Tensor): Decoded classification scores.
Nr   )
r   r   r   r   r
   trainingappendr6  detachr   )r2   r.  r  r/  r0  	bbox_head
score_headpos_mlprE   r1  r  
dec_bboxesdec_clslast_refined_bboxr   r   bboxrefined_bboxs                     r7   rS   $DeformableTransformerDecoder.forward  s1   < 
 '')
!$++.HA6ulW^_iWjkF<'D ==
0K)KLL}}z}V456%%l3%%emmD?K\;]4]&^_mm#z}V45!!,/ ,26--,,.\J% /( {{:&G(<<<r9   )r6  r   r   r   )rr   )r   rV   r7  rX   r   rV   r6  rV   )NN)r.  r\   r  r\   r/  r\   r0  r  r=  rX   r>  rX   r?  rX   rE   r]   r1  r]   r   ri   s   @r7   r   r     s~     M M2 *.,07=7= !7= 	7=
 7= 7= 7= 7= '7= *7= 7=r9   r   )!rd   
__future__r   r   r   torch.nnr$   torch.nn.functional
functionalr  torch.nn.initr   r   ultralytics.utils.torch_utilsr   r   r   utilsr	   r
   r   __all__Moduler   r   r   r   r   r   r   r   r   r   r;   r9   r7   <module>rO     s     "      4 4  T TMKbii MK`Il" IlX)ryy )B.Vryy .Vb1ryy 1</E")) /Ed+I")) +I\{(299 {(|@'		 @'FX=299 X=r9   