
    hF5                        S SK Jr  S SKrS SKrS SKJr  SSKJr   " S S\R                  5      r " S S	\R                  5      r	g)
    )annotationsN)nn   )RoPEAttentionc                     ^  \ rS rSrSr      S           S	U 4S jjjrS
S jr S           SS jjr   S           SS jjrSr	U =r
$ )MemoryAttentionLayer   aX  
Implements a memory attention layer with self-attention and cross-attention mechanisms for neural networks.

This class combines self-attention, cross-attention, and feedforward components to process input tensors and
generate memory-based attention outputs.

Attributes:
    d_model (int): Dimensionality of the model.
    dim_feedforward (int): Dimensionality of the feedforward network.
    dropout_value (float): Dropout rate for regularization.
    self_attn (RoPEAttention): Self-attention mechanism using RoPE (Rotary Position Embedding).
    cross_attn_image (RoPEAttention): Cross-attention mechanism for image processing.
    linear1 (nn.Linear): First linear layer of the feedforward network.
    linear2 (nn.Linear): Second linear layer of the feedforward network.
    norm1 (nn.LayerNorm): Layer normalization for self-attention output.
    norm2 (nn.LayerNorm): Layer normalization for cross-attention output.
    norm3 (nn.LayerNorm): Layer normalization for feedforward network output.
    dropout1 (nn.Dropout): Dropout layer after self-attention.
    dropout2 (nn.Dropout): Dropout layer after cross-attention.
    dropout3 (nn.Dropout): Dropout layer after feedforward network.
    activation (nn.ReLU): Activation function for the feedforward network.
    pos_enc_at_attn (bool): Flag to add positional encoding at attention.
    pos_enc_at_cross_attn_queries (bool): Flag to add positional encoding to cross-attention queries.
    pos_enc_at_cross_attn_keys (bool): Flag to add positional encoding to cross-attention keys.

Methods:
    forward: Performs the full memory attention operation on input tensors.
    _forward_sa: Performs self-attention on input tensor.
    _forward_ca: Performs cross-attention between target and memory tensors.

Examples:
    >>> layer = MemoryAttentionLayer(d_model=256, dim_feedforward=2048, dropout=0.1)
    >>> tgt = torch.randn(1, 100, 256)
    >>> memory = torch.randn(1, 100, 64)
    >>> pos = torch.randn(1, 100, 256)
    >>> query_pos = torch.randn(1, 100, 256)
    >>> output = layer(tgt, memory, pos, query_pos)
    >>> print(output.shape)
    torch.Size([1, 100, 256])
c                  > [         TU ]  5         Xl        X l        X0l        [        SSSS9U l        [        SSSSSS9U l        [        R                  " X5      U l
        [        R                  " U5      U l        [        R                  " X!5      U l        [        R                  " U5      U l        [        R                  " U5      U l        [        R                  " U5      U l        [        R                  " U5      U l        [        R                  " U5      U l        [        R                  " U5      U l        [        R*                  " 5       U l        X@l        X`l        XPl        g)a3  
Initialize a memory attention layer with self-attention, cross-attention, and feedforward components.

Args:
    d_model (int): Dimensionality of the model.
    dim_feedforward (int): Dimensionality of the feedforward network.
    dropout (float): Dropout rate for regularization.
    pos_enc_at_attn (bool): Whether to add positional encoding at attention.
    pos_enc_at_cross_attn_keys (bool): Whether to add positional encoding to cross-attention keys.
    pos_enc_at_cross_attn_queries (bool): Whether to add positional encoding to cross-attention queries.
   r   )embedding_dim	num_headsdownsample_rateT@   )rope_k_repeatr   r   r   	kv_in_dimN)super__init__d_modeldim_feedforwarddropout_valuer   	self_attncross_attn_imager   Linearlinear1Dropoutdropoutlinear2	LayerNormnorm1norm2norm3dropout1dropout2dropout3ReLU
activationpos_enc_at_attnpos_enc_at_cross_attn_queriespos_enc_at_cross_attn_keys)selfr   r   r   r'   r)   r(   	__class__s          i/home/james-whalen/.local/lib/python3.13/site-packages/ultralytics/models/sam/modules/memory_attention.pyr   MemoryAttentionLayer.__init__7   s   ( 	.$&SAWXY -!
 yy:zz'*yy:\\'*
\\'*
\\'*


7+

7+

7+'')  /-J**D'    c                    U R                  U5      nU R                  (       a  X2-   OU=pEU R                  XEUS9nXR                  U5      -   nU$ )z^Perform self-attention on input tensor using positional encoding and RoPE attention mechanism.)v)r   r'   r   r"   )r*   tgt	query_postgt2qks         r,   _forward_sa MemoryAttentionLayer._forward_sak   sL    zz#$($8$8 dB~~ad~+MM$''
r.   c                (   0 nUS:  a%  [        U R                  [        5      (       d   eSU0nU R                  U5      nU R                  " SU R                  (       a  Xs-   OUU R
                  (       a  X$-   OUUS.UD6nXR                  U5      -   nU$ )zXPerform cross-attention between target and memory tensors using RoPEAttention mechanism.r   num_k_exclude_rope)r4   r5   r0    )
isinstancer   r   r    r(   r)   r#   )r*   r1   memoryr2   posr9   kwdsr3   s           r,   _forward_ca MemoryAttentionLayer._forward_cas   s     !d33]CCCC(*<=D zz#$$ 
"&"D"Dd$"==fl6
 	
 MM$''
r.   c           	        U R                  X5      nU R                  XXCU5      nU R                  U5      nU R                  U R	                  U R                  U R                  U5      5      5      5      nXR                  U5      -   nU$ )ae  
Process input tensors through self-attention, cross-attention, and feedforward network layers.

Args:
    tgt (torch.Tensor): Target tensor for self-attention with shape (N, L, D).
    memory (torch.Tensor): Memory tensor for cross-attention with shape (N, S, D).
    pos (Optional[torch.Tensor]): Positional encoding for memory tensor.
    query_pos (Optional[torch.Tensor]): Positional encoding for target tensor.
    num_k_exclude_rope (int): Number of keys to exclude from rotary position embedding.

Returns:
    (torch.Tensor): Processed tensor after attention and feedforward layers with shape (N, L, D).
)r6   r?   r!   r   r   r&   r   r$   )r*   r1   r<   r=   r2   r9   r3   s          r,   forwardMemoryAttentionLayer.forward   su    * s.sI<NOzz#||DLLd9K)LMNMM$''
r.   )r&   r   r   r   r   r"   r#   r$   r   r   r   r   r    r!   r'   r)   r(   r   )r   i   皙?FTF)r   intr   rE   r   floatr'   boolr)   rG   r(   rG   )r1   torch.Tensorr2   torch.Tensor | NonereturnrH   )r   )r1   rH   r<   rH   r2   rI   r=   rI   r9   rE   rJ   rH   NNr   )r1   rH   r<   rH   r=   rI   r2   rI   r9   rE   rJ   rH   )__name__
__module____qualname____firstlineno____doc__r   r6   r?   rB   __static_attributes____classcell__r+   s   @r,   r   r      s   'V # %+/.32E2E 2E 	2E
 2E %)2E (,2E 2Eh #$  '	
 !   
: $()-"#  !	
 '   
 r.   r   c                  r   ^  \ rS rSrSr S         SU 4S jjjr   S           S	S jjrSrU =r$ )
MemoryAttention   a=  
Memory attention module for processing sequential data with self and cross-attention mechanisms.

This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
for processing sequential data, particularly useful in transformer-like architectures.

Attributes:
    d_model (int): The dimension of the model's hidden state.
    layers (nn.ModuleList): A list of MemoryAttentionLayer modules.
    num_layers (int): The number of attention layers.
    norm (nn.LayerNorm): Layer normalization applied to the output.
    pos_enc_at_input (bool): Whether to apply positional encoding at the input.
    batch_first (bool): Whether the input tensors are in batch-first format.

Methods:
    forward: Processes input tensors through the attention layers.

Examples:
    >>> d_model = 256
    >>> layer = MemoryAttentionLayer(d_model)
    >>> attention = MemoryAttention(d_model, pos_enc_at_input=True, layer=layer, num_layers=3)
    >>> curr = torch.randn(10, 32, d_model)  # (seq_len, batch_size, d_model)
    >>> memory = torch.randn(20, 32, d_model)  # (mem_len, batch_size, d_model)
    >>> curr_pos = torch.randn(10, 32, d_model)
    >>> memory_pos = torch.randn(20, 32, d_model)
    >>> output = attention(curr, memory, curr_pos, memory_pos)
    >>> print(output.shape)
    torch.Size([10, 32, 256])
c                $  > [         TU ]  5         Xl        [        R                  " [        U5       Vs/ s H  n[        R                  " U5      PM     sn5      U l        X@l	        [        R                  " U5      U l        X l        XPl        gs  snf )a  
Initialize MemoryAttention with specified layers and normalization for sequential data processing.

This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
for processing sequential data, particularly useful in transformer-like architectures.

Args:
    d_model (int): The dimension of the model's hidden state.
    pos_enc_at_input (bool): Whether to apply positional encoding at the input.
    layer (nn.Module): The attention layer to be used in the module.
    num_layers (int): The number of attention layers.
    batch_first (bool): Whether the input tensors are in batch-first format.

Examples:
    >>> d_model = 256
    >>> layer = MemoryAttentionLayer(d_model)
    >>> attention = MemoryAttention(d_model, pos_enc_at_input=True, layer=layer, num_layers=3)
    >>> curr = torch.randn(10, 32, d_model)  # (seq_len, batch_size, d_model)
    >>> memory = torch.randn(20, 32, d_model)  # (mem_len, batch_size, d_model)
    >>> curr_pos = torch.randn(10, 32, d_model)
    >>> memory_pos = torch.randn(20, 32, d_model)
    >>> output = attention(curr, memory, curr_pos, memory_pos)
    >>> print(output.shape)
    torch.Size([10, 32, 256])
N)r   r   r   r   
ModuleListrangecopydeepcopylayers
num_layersr   normpos_enc_at_inputbatch_first)r*   r   r_   layerr]   r`   _r+   s          r,   r   MemoryAttention.__init__   sl    B 	mm5CT$UCTaT]]5%9CT$UV$LL)	 0&	 %Vs    Bc           	     
   [        U[        5      (       aD  [        U[        5      (       d   e[        U5      [        U5      s=:X  a  S:X  d   e   eUS   US   p1UR                  S   UR                  S   :X  d   S5       eUnU R                  (       a  Ub  USU-  -   nU R
                  (       aH  UR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nU R                   H5  n0 n[        UR                  [        5      (       a  SU0nU" SUUUUS.UD6nM7     U R                  U5      n	U R
                  (       a$  U	R                  SS5      n	UR                  SS5      nU	$ )a  
Process inputs through attention layers, applying self and cross-attention with positional encoding.

Args:
    curr (torch.Tensor): Self-attention input tensor, representing the current state.
    memory (torch.Tensor): Cross-attention input tensor, representing memory information.
    curr_pos (Optional[torch.Tensor]): Positional encoding for self-attention inputs.
    memory_pos (Optional[torch.Tensor]): Positional encoding for cross-attention inputs.
    num_obj_ptr_tokens (int): Number of object pointer tokens to exclude from rotary position embedding.

Returns:
    (torch.Tensor): Processed output tensor after applying attention layers and normalization.

Examples:
    >>> d_model = 256
    >>> layer = MemoryAttentionLayer(d_model)
    >>> attention = MemoryAttention(d_model, pos_enc_at_input=True, layer=layer, num_layers=3)
    >>> curr = torch.randn(10, 32, d_model)  # (seq_len, batch_size, d_model)
    >>> memory = torch.randn(20, 32, d_model)  # (mem_len, batch_size, d_model)
    >>> curr_pos = torch.randn(10, 32, d_model)
    >>> memory_pos = torch.randn(20, 32, d_model)
    >>> output = attention(curr, memory, curr_pos, memory_pos)
    >>> print(output.shape)
    torch.Size([10, 32, 256])
r   r   z/Batch size must be the same for curr and memoryrD   r9   )r1   r<   r=   r2   r:   )r;   listlenshaper_   r`   	transposer\   r   r   r^   )
r*   currr<   curr_pos
memory_posnum_obj_ptr_tokensoutputra   r>   normed_outputs
             r,   rB   MemoryAttention.forward   s   B dD!!h----t9H222222!!Whqk(zz!}Q/b1bb/  X%9cHn,F%%a+F))!Q/H%%a+F#--a3J[[ED%00-@@,.@A "	
 F ! 		&))33Aq9M))!Q/Hr.   )r`   r   r\   r^   r]   r_   )T)
r   rE   r_   rG   ra   z	nn.Moduler]   rE   r`   rG   rK   )ri   rH   r<   rH   rj   rI   rk   rI   rl   rE   rJ   rH   )	rL   rM   rN   rO   rP   r   rB   rQ   rR   rS   s   @r,   rU   rU      s    H !'''' '' 	''
 '' '' ''Z )-*."#FF F &	F
 (F  F 
F Fr.   rU   )

__future__r   rZ   torchr   blocksr   Moduler   rU   r:   r.   r,   <module>rt      s;    #    !Z299 ZzNbii Nr.   