
    hn:                        S SK Jr  S SKrS SKrS SKJrJr  S SKJr   " S S\R                  5      r	 " S S\R                  5      r
 " S	 S
\R                  5      rg)    )annotationsN)Tensornn)MLPBlockc                     ^  \ rS rSrSr\R                  S4             SU 4S jjjr        SS jrSr	U =r
$ )	TwoWayTransformer   a0  
A Two-Way Transformer module for simultaneous attention to image and query points.

This class implements a specialized transformer decoder that attends to an input image using queries with
supplied positional embeddings. It's useful for tasks like object detection, image segmentation, and point
cloud processing.

Attributes:
    depth (int): Number of layers in the transformer.
    embedding_dim (int): Channel dimension for input embeddings.
    num_heads (int): Number of heads for multihead attention.
    mlp_dim (int): Internal channel dimension for the MLP block.
    layers (nn.ModuleList): List of TwoWayAttentionBlock layers composing the transformer.
    final_attn_token_to_image (Attention): Final attention layer from queries to image.
    norm_final_attn (nn.LayerNorm): Layer normalization applied to final queries.

Methods:
    forward: Process image and point embeddings through the transformer.

Examples:
    >>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
    >>> image_embedding = torch.randn(1, 256, 32, 32)
    >>> image_pe = torch.randn(1, 256, 32, 32)
    >>> point_embedding = torch.randn(1, 100, 256)
    >>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
    >>> print(output_queries.shape, output_image.shape)
   c                R  > [         TU ]  5         Xl        X l        X0l        X@l        [        R                  " 5       U l        [        U5       H-  nU R                  R                  [        UUUUUUS:H  S95        M/     [        X#US9U l        [        R                  " U5      U l        g)a#  
Initialize a Two-Way Transformer for simultaneous attention to image and query points.

Args:
    depth (int): Number of layers in the transformer.
    embedding_dim (int): Channel dimension for input embeddings.
    num_heads (int): Number of heads for multihead attention. Must divide embedding_dim.
    mlp_dim (int): Internal channel dimension for the MLP block.
    activation (Type[nn.Module], optional): Activation function to use in the MLP block.
    attention_downsample_rate (int, optional): Downsampling rate for attention mechanism.
r   )embedding_dim	num_headsmlp_dim
activationattention_downsample_rateskip_first_layer_pedownsample_rateN)super__init__depthr   r   r   r   
ModuleListlayersrangeappendTwoWayAttentionBlock	Attentionfinal_attn_token_to_image	LayerNormnorm_final_attn)	selfr   r   r   r   r   r   i	__class__s	           d/home/james-whalen/.local/lib/python3.13/site-packages/ultralytics/models/sam/modules/transformer.pyr   TwoWayTransformer.__init__*   s    ( 	
*"mmouAKK$"/'#).G)*a	  *3=]v)w&!||M:    c                .   UR                  S5      R                  SSS5      nUR                  S5      R                  SSS5      nUnUnU R                   H  nU" UUUUS9u  pEM     XC-   nXR-   nU R                  XxUS9n	XI-   nU R	                  U5      nXE4$ )aD  
Process image and point embeddings through the Two-Way Transformer.

Args:
    image_embedding (torch.Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
    image_pe (torch.Tensor): Positional encoding to add to the image, with same shape as image_embedding.
    point_embedding (torch.Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).

Returns:
    queries (torch.Tensor): Processed point embeddings with shape (B, N_points, embedding_dim).
    keys (torch.Tensor): Processed image embeddings with shape (B, H*W, embedding_dim).
r
   r      )querieskeysquery_pekey_peqkv)flattenpermuter   r   r   )
r    image_embeddingimage_pepoint_embeddingr(   r)   layerr-   r.   attn_outs
             r#   forwardTwoWayTransformer.forwardT   s    & *11!4<<Q1E##A&..q!Q7 " [[E!(	MGT ! %O11Ad1C$&&w/}r%   )r   r   r   r   r   r   r   )r   intr   r9   r   r9   r   r9   r   type[nn.Module]r   r9   returnNone)r2   torch.Tensorr3   r=   r4   r=   r;   !tuple[torch.Tensor, torch.Tensor]__name__
__module____qualname____firstlineno____doc__r   ReLUr   r7   __static_attributes____classcell__r"   s   @r#   r   r      s    D ')gg)*(;(; (; 	(;
 (; $(; $'(; 
(; (;T*%* * &	*
 
+* *r%   r   c                     ^  \ rS rSrSrS\R                  SS4             S	U 4S jjjr          S
S jrSr	U =r
$ )r      a  
A two-way attention block for simultaneous attention to image and query points.

This class implements a specialized transformer block with four main layers: self-attention on sparse inputs,
cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense
inputs to sparse inputs.

Attributes:
    self_attn (Attention): Self-attention layer for queries.
    norm1 (nn.LayerNorm): Layer normalization after self-attention.
    cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
    norm2 (nn.LayerNorm): Layer normalization after token-to-image attention.
    mlp (MLPBlock): MLP block for transforming query embeddings.
    norm3 (nn.LayerNorm): Layer normalization after MLP block.
    norm4 (nn.LayerNorm): Layer normalization after image-to-token attention.
    cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
    skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.

Methods:
    forward: Apply self-attention and cross-attention to queries and keys.

Examples:
    >>> embedding_dim, num_heads = 256, 8
    >>> block = TwoWayAttentionBlock(embedding_dim, num_heads)
    >>> queries = torch.randn(1, 100, embedding_dim)
    >>> keys = torch.randn(1, 1000, embedding_dim)
    >>> query_pe = torch.randn(1, 100, embedding_dim)
    >>> key_pe = torch.randn(1, 1000, embedding_dim)
    >>> processed_queries, processed_keys = block(queries, keys, query_pe, key_pe)
i   r
   Fc                  > [         TU ]  5         [        X5      U l        [        R
                  " U5      U l        [        XUS9U l        [        R
                  " U5      U l        [        XU5      U l
        [        R
                  " U5      U l        [        R
                  " U5      U l        [        XUS9U l        X`l        g)a0  
Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.

This block implements a specialized transformer layer with four main components: self-attention on sparse
inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention
of dense inputs to sparse inputs.

Args:
    embedding_dim (int): Channel dimension of the embeddings.
    num_heads (int): Number of attention heads in the attention layers.
    mlp_dim (int, optional): Hidden dimension of the MLP block.
    activation (Type[nn.Module], optional): Activation function for the MLP block.
    attention_downsample_rate (int, optional): Downsampling rate for the attention mechanism.
    skip_first_layer_pe (bool, optional): Whether to skip positional encoding in the first layer.
r   N)r   r   r   	self_attnr   r   norm1cross_attn_token_to_imagenorm2r   mlpnorm3norm4cross_attn_image_to_tokenr   )r    r   r   r   r   r   r   r"   s          r#   r   TwoWayAttentionBlock.__init__   s    0 	"=<\\-0
)2=]v)w&\\-0
MJ?\\-0
\\-0
)2=]v)w&#6 r%   c                   U R                   (       a  U R                  XUS9nOX-   nU R                  XUUS9nX-   nU R                  U5      nX-   nX$-   nU R                  XWUS9nX-   nU R	                  U5      nU R                  U5      nX-   nU R                  U5      nX-   nX$-   nU R                  XuUS9nX&-   nU R                  U5      nX4$ )ar  
Apply two-way attention to process query and key embeddings in a transformer block.

Args:
    queries (torch.Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
    keys (torch.Tensor): Key embeddings with shape (B, N_keys, embedding_dim).
    query_pe (torch.Tensor): Positional encodings for queries with same shape as queries.
    key_pe (torch.Tensor): Positional encodings for keys with same shape as keys.

Returns:
    queries (torch.Tensor): Processed query embeddings with shape (B, N_queries, embedding_dim).
    keys (torch.Tensor): Processed key embeddings with shape (B, N_keys, embedding_dim).
r,   )	r   rL   rM   rN   rO   rP   rQ   rS   rR   )	r    r(   r)   r*   r+   r-   r6   r.   mlp_outs	            r#   r7   TwoWayAttentionBlock.forward   s    " ##nnwWnEG"A~~'~:H(G**W% M11Ad1C$**W% ((7##**W% M11Ag1Fzz$}r%   )	rS   rN   rP   rM   rO   rQ   rR   rL   r   )r   r9   r   r9   r   r9   r   r:   r   r9   r   boolr;   r<   )
r(   r=   r)   r=   r*   r=   r+   r=   r;   r>   r?   rH   s   @r#   r   r      s    F &(gg)*$)%7%7 %7 	%7
 $%7 $'%7 "%7 
%7 %7N,#,+7,CO,Ye,	*, ,r%   r   c                  |   ^  \ rS rSrSr  S         S	U 4S jjjr\S
S j5       r\SS j5       rSS jr	Sr
U =r$ )r      a  
An attention layer with downscaling capability for embedding size after projection.

This class implements a multi-head attention mechanism with the option to downsample the internal
dimension of queries, keys, and values.

Attributes:
    embedding_dim (int): Dimensionality of input embeddings.
    kv_in_dim (int): Dimensionality of key and value inputs.
    internal_dim (int): Internal dimension after downsampling.
    num_heads (int): Number of attention heads.
    q_proj (nn.Linear): Linear projection for queries.
    k_proj (nn.Linear): Linear projection for keys.
    v_proj (nn.Linear): Linear projection for values.
    out_proj (nn.Linear): Linear projection for output.

Methods:
    _separate_heads: Separate input tensor into attention heads.
    _recombine_heads: Recombine separated attention heads.
    forward: Compute attention output for given query, key, and value tensors.

Examples:
    >>> attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
    >>> q = torch.randn(1, 100, 256)
    >>> k = v = torch.randn(1, 50, 256)
    >>> output = attn(q, k, v)
    >>> print(output.shape)
    torch.Size([1, 100, 256])
c                  > [         TU ]  5         Xl        Ub  UOUU l        X-  U l        X l        U R                  U-  S:X  d   S5       e[        R                  " XR                  5      U l        [        R                  " U R                  U R                  5      U l	        [        R                  " U R                  U R                  5      U l
        [        R                  " U R                  U5      U l        g)a  
Initialize the Attention module with specified dimensions and settings.

Args:
    embedding_dim (int): Dimensionality of input embeddings.
    num_heads (int): Number of attention heads.
    downsample_rate (int, optional): Factor by which internal dimensions are downsampled.
    kv_in_dim (int | None, optional): Dimensionality of key and value inputs. If None, uses embedding_dim.

Raises:
    AssertionError: If num_heads does not evenly divide the internal dim (embedding_dim / downsample_rate).
Nr   z$num_heads must divide embedding_dim.)r   r   r   	kv_in_diminternal_dimr   r   Linearq_projk_projv_projout_proj)r    r   r   r   r\   r"   s        r#   r   Attention.__init__  s    & 	*&/&;)<"  9,1Y3YY1ii/@/@Aii0A0ABii0A0AB		$"3"3]Cr%   c                n    U R                   u  p#nU R                  X#XU-  5      n U R                  SS5      $ )zGSeparate the input tensor into the specified number of attention heads.r'   r
   )shapereshape	transpose)xr   bncs        r#   _separate_headsAttention._separate_heads5  s6     ''aIIaII~6{{1a  r%   c                l    U R                   u  pp4U R                  SS5      n U R                  XX$-  5      $ )z9Recombine separated attention heads into a single tensor.r'   r
   )re   rg   rf   )rh   ri   n_headsn_tokens
c_per_heads        r#   _recombine_headsAttention._recombine_heads<  s6     ,-77(HKK1yyg&:;;r%   c                   U R                  U5      nU R                  U5      nU R                  U5      nU R                  XR                  5      nU R                  X R                  5      nU R                  X0R                  5      nUR
                  u      pEXR                  SSSS5      -  nU[        R                  " U5      -  n[        R                  " USS9nXc-  nU R                  U5      nU R                  U5      $ )a  
Apply multi-head attention to query, key, and value tensors with optional downsampling.

Args:
    q (torch.Tensor): Query tensor with shape (B, N_q, embedding_dim).
    k (torch.Tensor): Key tensor with shape (B, N_k, embedding_dim).
    v (torch.Tensor): Value tensor with shape (B, N_k, embedding_dim).

Returns:
    (torch.Tensor): Output tensor after attention with shape (B, N_q, embedding_dim).
r   r'      r
   )dim)r_   r`   ra   rl   r   re   r1   mathsqrttorchsoftmaxrr   rb   )r    r-   r.   r/   _rq   attnouts           r#   r7   Attention.forwardC  s     KKNKKNKKN   NN3  NN3  NN3  gg1a99Q1a((dii
++}}Tr* h##C(}}S!!r%   )r   r]   r`   r\   r   rb   r_   ra   )r'   N)
r   r9   r   r9   r   r9   r\   r9   r;   r<   )rh   r=   r   r9   r;   r=   )rh   r   r;   r   )r-   r=   r.   r=   r/   r=   r;   r=   )r@   rA   rB   rC   rD   r   staticmethodrl   rr   r7   rF   rG   rH   s   @r#   r   r      s    D  !DD D 	D
 D 
D D> ! ! < <" "r%   r   )
__future__r   rx   rz   r   r   ultralytics.nn.modulesr   Moduler   r   r    r%   r#   <module>r      sN    #    +q		 qhs299 slk"		 k"r%   