
    hd                        S SK Jr  S SKrS SKJr  S SKJrJr   " S S\R                  5      r " S S\R                  5      r	g)	    )annotationsN)nn)MLPLayerNorm2dc                     ^  \ rS rSrSrS\R                  SS4             S	U 4S jjjr            S
S jr          SS jr	Sr
U =r$ )MaskDecoder   a  
Decoder module for generating masks and their associated quality scores using a transformer architecture.

This class predicts masks given image and prompt embeddings, utilizing a transformer to process the inputs and
generate mask predictions along with their quality scores.

Attributes:
    transformer_dim (int): Channel dimension for the transformer module.
    transformer (nn.Module): Transformer module used for mask prediction.
    num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
    iou_token (nn.Embedding): Embedding for the IoU token.
    num_mask_tokens (int): Number of mask tokens.
    mask_tokens (nn.Embedding): Embedding for the mask tokens.
    output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
    output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
    iou_prediction_head (nn.Module): MLP for predicting mask quality.

Methods:
    forward: Predict masks given image and prompt embeddings.
    predict_masks: Internal method for mask prediction.

Examples:
    >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
    >>> masks, iou_pred = decoder(
    ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, multimask_output=True
    ... )
    >>> print(f"Predicted masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
      c                  > [         TU ]  5         Xl        X l        X0l        [
        R                  " SU5      U l        US-   U l        [
        R                  " U R                  U5      U l	        [
        R                  " [
        R                  " XS-  SSS9[        US-  5      U" 5       [
        R                  " US-  US-  SSS9U" 5       5      U l        [
        R                  " [        U R                  5       Vs/ s H  n[!        XUS-  S5      PM     sn5      U l        [!        XU R                  U5      U l        gs  snf )a  
Initialize the MaskDecoder module for generating masks and their associated quality scores.

Args:
    transformer_dim (int): Channel dimension for the transformer module.
    transformer (nn.Module): Transformer module used for mask prediction.
    num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
    activation (Type[nn.Module]): Type of activation to use when upscaling masks.
    iou_head_depth (int): Depth of the MLP used to predict mask quality.
    iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.

Examples:
    >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
    >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer)
    >>> print(decoder)
         kernel_sizestride   r
   N)super__init__transformer_dimtransformernum_multimask_outputsr   	Embedding	iou_tokennum_mask_tokensmask_tokens
SequentialConvTranspose2dr   output_upscaling
ModuleListranger   output_hypernetworks_mlpsiou_prediction_head)	selfr   r   r   
activationiou_head_depthiou_head_hidden_dim_	__class__s	           a/home/james-whalen/.local/lib/python3.13/site-packages/ultralytics/models/sam/modules/decoders.pyr   MaskDecoder.__init__)   s   2 	.&%:"a94q8<<(<(<oN "10DRS\]^1,-L!3_5IWXabcL!
 *,UZ[_[o[oUpqUpPQS?a3GKUpq*
& $'TMaMacq#r  rs   <D?c                    U R                  UUUUS9u  pgU(       a  [        SS5      O[        SS5      nUSS2USS2SS24   nUSS2U4   nXg4$ )a  
Predict masks given image and prompt embeddings.

Args:
    image_embeddings (torch.Tensor): Embeddings from the image encoder.
    image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings.
    sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes.
    dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs.
    multimask_output (bool): Whether to return multiple masks or a single mask.

Returns:
    masks (torch.Tensor): Batched predicted masks.
    iou_pred (torch.Tensor): Batched predictions of mask quality.

Examples:
    >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
    >>> image_emb = torch.rand(1, 256, 64, 64)
    >>> image_pe = torch.rand(1, 256, 64, 64)
    >>> sparse_emb = torch.rand(1, 2, 256)
    >>> dense_emb = torch.rand(1, 256, 64, 64)
    >>> masks, iou_pred = decoder(image_emb, image_pe, sparse_emb, dense_emb, multimask_output=True)
    >>> print(f"Masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
)image_embeddingsimage_pesparse_prompt_embeddingsdense_prompt_embeddingsr   Nr   )predict_masksslice)	r$   r-   r.   r/   r0   multimask_outputmasksiou_pred
mask_slices	            r*   forwardMaskDecoder.forwardY   sg    > ,,-%=$;	 - 
 (8U1d^U1a[
aQ)*AzM*    c           
        [         R                  " U R                  R                  U R                  R                  /SS9nUR                  S5      R                  UR                  S   SS5      n[         R                  " XS4SS9n[         R                  " XR                  S   SS9nXt-   n[         R                  " X&R                  S   SS9nUR                  u  ppU R                  XxU5      u  pUSS2SSS24   nUSS2SSU R                  -   2SS24   nUR                  SS5      R                  XX5      nU R                  U5      n[        U R                  5       Vs/ s H"  nU R                  U   " USS2USS24   5      PM$     nn[         R                   " USS9nUR                  u  ppUUR                  XX-  5      -  R                  U	SX5      nU R#                  U5      nUU4$ s  snf )z`Predict masks and quality scores using image and prompt embeddings via transformer architecture.r   dimr   Nr   )torchcatr   weightr   	unsqueezeexpandshaperepeat_interleaver   r   	transposeviewr   r!   r"   stackr#   )r$   r-   r.   r/   r0   output_tokenstokenssrcpos_srcbchwhsiou_token_outmask_tokens_outupscaled_embeddingihyper_in_listhyper_inr4   r5   s                         r*   r1   MaskDecoder.predict_masks   s    		4>>#8#8$:J:J:Q:Q"RXYZ%//299:R:X:XYZ:[]_acdMD!L %%&6QQO+))(LLOKYY
a ""381a7QQ)=)=%= >AB mmAq!&&qQ2!2237QVW[WkWkQl-
QlAD**1-oaAg.FGQl 	 -
 ;;}!4'--
a.33A!%@@FFq"aS ++M:h-
s   !)G-)	r#   r   r   r   r   r"   r   r   r   )r   intr   	nn.Moduler   rX   r%   type[nn.Module]r&   rX   r'   rX   returnNone)r-   torch.Tensorr.   r]   r/   r]   r0   r]   r3   boolr[   !tuple[torch.Tensor, torch.Tensor])
r-   r]   r.   r]   r/   r]   r0   r]   r[   r_   )__name__
__module____qualname____firstlineno____doc__r   GELUr   r7   r1   __static_attributes____classcell__r)   s   @r*   r   r      s    B &'&(gg#&.s.s .s  #	.s
 $.s .s !.s 
.s .s`+&+ + #/	+
 ".+ + 
++Z%&% % #/	%
 ".% 
+% %r9   r   c                     ^  \ rS rSrSrS\R                  SSSSSSSSSS4                     SU 4S jjjr S               SS	 jjr S             SS
 jjr	S r
S rSrU =r$ )SAM2MaskDecoder   a	  
Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.

This class extends the functionality of the MaskDecoder, incorporating additional features such as
high-resolution feature processing, dynamic multimask output, and object score prediction.

Attributes:
    transformer_dim (int): Channel dimension of the transformer.
    transformer (nn.Module): Transformer used to predict masks.
    num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
    iou_token (nn.Embedding): Embedding for IOU token.
    num_mask_tokens (int): Total number of mask tokens.
    mask_tokens (nn.Embedding): Embedding for mask tokens.
    pred_obj_scores (bool): Whether to predict object scores.
    obj_score_token (nn.Embedding): Embedding for object score token.
    use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
    output_upscaling (nn.Sequential): Upscaling layers for output.
    use_high_res_features (bool): Whether to use high-resolution features.
    conv_s0 (nn.Conv2d): Convolutional layer for high-resolution features (s0).
    conv_s1 (nn.Conv2d): Convolutional layer for high-resolution features (s1).
    output_hypernetworks_mlps (nn.ModuleList): List of MLPs for output hypernetworks.
    iou_prediction_head (MLP): MLP for IOU prediction.
    pred_obj_score_head (nn.Linear | MLP): Linear layer or MLP for object score prediction.
    dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
    dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
    dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.

Methods:
    forward: Predict masks given image and prompt embeddings.
    predict_masks: Predict instance segmentation masks from image and prompt embeddings.
    _get_stability_scores: Compute mask stability scores based on IoU between thresholds.
    _dynamic_multimask_via_stability: Dynamically select the most stable mask output.

Examples:
    >>> image_embeddings = torch.rand(1, 256, 64, 64)
    >>> image_pe = torch.rand(1, 256, 64, 64)
    >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
    >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
    >>> decoder = SAM2MaskDecoder(256, transformer)
    >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
    ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
    ... )
r
   r   Fg?g\(\?c                <  > [         TU ]  5         Xl        X l        X0l        [
        R                  " SU5      U l        US-   U l        [
        R                  " U R                  U5      U l	        Xl
        U R                  (       a  [
        R                  " SU5      U l        Xl        [
        R                  " [
        R                  " XS-  SSS9[        US-  5      U" 5       [
        R                  " US-  US-  SSS9U" 5       5      U l        Xpl        U(       a<  [
        R$                  " XS-  SSS9U l        [
        R$                  " XS-  SSS9U l        [
        R*                  " [-        U R                  5       Vs/ s H  n[/        XUS-  S5      PM     sn5      U l        [/        UUU R                  UUS9U l        U R                  (       a5  [
        R4                  " US5      U l        U(       a  [/        XSS5      U l        Xl        Xl        Xl        gs  snf )	a^  
Initialize the SAM2MaskDecoder module for predicting instance segmentation masks.

This decoder extends the functionality of MaskDecoder, incorporating additional features such as
high-resolution feature processing, dynamic multimask output, and object score prediction.

Args:
    transformer_dim (int): Channel dimension of the transformer.
    transformer (nn.Module): Transformer used to predict masks.
    num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
    activation (Type[nn.Module]): Type of activation to use when upscaling masks.
    iou_head_depth (int): Depth of the MLP used to predict mask quality.
    iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
    use_high_res_features (bool): Whether to use high-resolution features.
    iou_prediction_use_sigmoid (bool): Whether to use sigmoid for IOU prediction.
    dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
    dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
    dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.
    pred_obj_scores (bool): Whether to predict object scores.
    pred_obj_scores_mlp (bool): Whether to use MLP for object score prediction.
    use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.

Examples:
    >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
    >>> decoder = SAM2MaskDecoder(transformer_dim=256, transformer=transformer)
    >>> print(decoder)
r   r   r   r   r   r
   )sigmoidN)r   r   r   r   r   r   r   r   r   r   pred_obj_scoresobj_score_tokenuse_multimask_token_for_obj_ptrr   r   r   r   use_high_res_featuresConv2dconv_s0conv_s1r    r!   r   r"   r#   Linearpred_obj_score_headdynamic_multimask_via_stability!dynamic_multimask_stability_delta"dynamic_multimask_stability_thresh)r$   r   r   r   r%   r&   r'   rq   iou_prediction_use_sigmoidrw   rx   ry   rn   pred_obj_scores_mlprp   r(   r)   s                   r*   r   SAM2MaskDecoder.__init__   s   X 	.&%:"a94q8<<(<(<oN.#%<<?#CD /N, "10DRS\]^1,-L!3_5IWXabcL!
 &;" 99_6JXYbcdDL99_6JXYbcdDL)+UZ[_[o[oUpqUpPQS?a3GKUpq*
& $'  .$
  ')yy!'DD$"+.QRTU+V( 0O,1R.2T/' rs   >Hc           	        U R                  UUUUUUS9u  ppU(       a  USS2SS2SS2SS24   nU	SS2SS24   n	ORU R                  (       a%  U R                  (       d  U R                  X5      u  pOUSS2SS2SS2SS24   nU	SS2SS24   n	U(       a  U R                  (       a  U
SS2SS24   nOU
SS2SS24   nXX4$ )a  
Predict masks given image and prompt embeddings.

Args:
    image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
    image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings (B, C, H, W).
    sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes with shape (B, N, C).
    dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs with shape (B, C, H, W).
    multimask_output (bool): Whether to return multiple masks or a single mask.
    repeat_image (bool): Flag to repeat the image embeddings.
    high_res_features (list[torch.Tensor] | None, optional): Optional high-resolution features.

Returns:
    masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
    iou_pred (torch.Tensor): Batched predictions of mask quality with shape (B, N).
    sam_tokens_out (torch.Tensor): Batched SAM token for mask output with shape (B, N, C).
    object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).

Examples:
    >>> image_embeddings = torch.rand(1, 256, 64, 64)
    >>> image_pe = torch.rand(1, 256, 64, 64)
    >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
    >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
    >>> decoder = SAM2MaskDecoder(256, transformer)
    >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
    ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
    ... )
)r-   r.   r/   r0   repeat_imagehigh_res_featuresNr   r   )r1   rw   training _dynamic_multimask_via_stabilityrp   )r$   r-   r.   r/   r0   r3   r~   r   r4   r5   rR   object_score_logitssam_tokens_outs                r*   r7   SAM2MaskDecoder.forward8  s    L AE@R@R-%=$;%/ AS A
= !QRA+&E12H11$--"CCETOE8!QqS!Q,'E1Q3'H D D,QU3N -Q!V4NCCr9   c           
        SnU R                   (       aW  [        R                  " U R                  R                  U R
                  R                  U R                  R                  /SS9nSnO?[        R                  " U R
                  R                  U R                  R                  /SS9nUR                  S5      R                  UR                  S   SS5      n[        R                  " X4SS9n	U(       a#  [        R                  " XR                  S   SS9n
O$UR                  S   U	R                  S   :X  d   eUn
X-   n
UR                  S   S:X  d   S5       e[        R                  " X)R                  S   SS9nU
R                  u  ppU R                  XU	5      u  nn
USS2USS24   nUSS2US-   US-   U R                  -   2SS24   nU
R                  SS5      R                  XX5      n
U R                  (       a  Uc  U R!                  U
5      nO?U R                   u  nnnnnUu  nnU" U" U" U
5      U-   5      5      nU" U" U5      U-   5      n[#        U R                  5       Vs/ s H"  nU R$                  U   " USS2USS24   5      PM$     nn[        R&                  " USS9nUR                  u  ppUUR                  XX-  5      -  R                  USX5      nU R)                  U5      nU R                   (       a$  US:X  d   eU R+                  USS2SSS24   5      n O"SUR-                  UR                  S   S5      -  n UUUU 4$ s  snf )	zYPredict instance segmentation masks from image and prompt embeddings using a transformer.r   r;   r   r=   z@image_pe should have size 1 in batch dim (from `get_dense_pe()`)Nr   g      $@)rn   r>   r?   ro   r@   r   r   rA   rB   rC   rD   r   r   rE   rF   rq   r   r!   r"   rG   r#   rv   new_ones)!r$   r-   r.   r/   r0   r~   r   srH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   dc1ln1act1dc2act2feat_s0feat_s1rT   rU   rV   r4   r5   r   s!                                    r*   r1   SAM2MaskDecoder.predict_masks}  sE    !II((//NN))$$++
 M A!IIt~~'<'<d>N>N>U>U&V\]^M%//299:R:X:XYZ:[]_acdMD!L ))*:LLOQRSC#))!,Q???"C+~~a A%i'ii%))(LLOKYY
a ""38C1a7QAQ1E1E)E FIJ mmAq!&&qQ2))->-F!%!6!6s!;(,(=(=%CdC0GW!%c#c(W*<&=!>!%c*<&=&G!H RWW[WkWkQl-
QlAD**1-oaAg.FGQl 	 -
 ;;}!4'--
a.33A!%@@FFq"aS ++M:6M6"&":":2aAg;"G #'):):8>>!;La)P"Ph1DDD!-
s   ,)Mc                   UR                  S5      nU R                  n[        R                  " X:  SS9R	                  5       n[        R                  " X* :  SS9R	                  5       n[        R
                  " US:  X4-  S5      $ )zNCompute mask stability scores based on IoU between upper and lower thresholds.r=   r;   r   g      ?)flattenrx   r>   sumfloatwhere)r$   mask_logitsstability_deltaarea_iarea_us        r*   _get_stability_scores%SAM2MaskDecoder._get_stability_scores  sq    !))"-@@;8bAGGI;)99rBHHJ{{6A:v<<r9   c                6   USS2SS2SS2SS24   nUSS2SS24   n[         R                  " USS9n[         R                  " UR                  S   UR                  S9nX6U4   nUR                  S5      nXFU4   nUR                  S5      nUSS2SS2SS2SS24   n	USS2SS24   n
U R                  U	5      nXR                  :  n[         R                  " US   R                  U	5      U	U5      n[         R                  " UR                  U
5      U
U5      nX4$ )a  
Dynamically select the most stable mask output based on stability scores and IoU predictions.

This method is used when outputting a single mask. If the stability score from the current single-mask
output (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs
(based on output tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask
for both clicking and tracking scenarios.

Args:
    all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is
        batch size, N is number of masks (typically 4), and H, W are mask dimensions.
    all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).

Returns:
    mask_logits_out (torch.Tensor): Selected mask logits, shape (B, 1, H, W).
    iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).

Examples:
    >>> decoder = SAM2MaskDecoder(...)
    >>> all_mask_logits = torch.rand(2, 4, 256, 256)  # 2 images, 4 masks each
    >>> all_iou_scores = torch.rand(2, 4)
    >>> mask_logits, iou_scores = decoder._dynamic_multimask_via_stability(all_mask_logits, all_iou_scores)
    >>> print(mask_logits.shape, iou_scores.shape)
    torch.Size([2, 1, 256, 256]) torch.Size([2, 1])
Nr   r=   r;   r   )device).NN)
r>   argmaxarangerC   r   rA   r   ry   r   	expand_as)r$   all_mask_logitsall_iou_scoresmultimask_logitsmultimask_iou_scoresbest_scores_inds
batch_indsbest_multimask_logitsbest_multimask_iou_scoressinglemask_logitssinglemask_iou_scoresstability_scores	is_stablemask_logits_outiou_scores_outs                  r*   r   0SAM2MaskDecoder._dynamic_multimask_via_stability  sD   6 +1ab!Q;7-ae4 <<(<"E\\"6"<"<Q"?H]H]^
 0=M1M N 5 ? ? B$8EU9U$V!$=$G$G$J! ,AqsAqL9 .q!A#v 6556GH$(O(OO	  ++o&001BC!

  56!%

 ..r9   )rs   rt   rx   ry   rw   r#   r   r   r   r   ro   r"   r   rv   rn   r   r   rq   rp   )r   rX   r   rY   r   rX   r%   rZ   r&   rX   r'   rX   rq   r^   rn   r^   r{   r^   rp   r^   r[   r\   )N)r-   r]   r.   r]   r/   r]   r0   r]   r3   r^   r~   r^   r   list[torch.Tensor] | Noner[   =tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])r-   r]   r.   r]   r/   r]   r0   r]   r~   r^   r   r   r[   r   )r`   ra   rb   rc   rd   r   re   r   r7   r1   r   r   rf   rg   rh   s   @r*   rj   rj      s   *` &'&(gg#&&+#((-*.+/ %$)05[U[U [U  #	[U
 $[U [U ![U  $[U [U "[U *.[U  
![U [UJ 8<CD&CD CD #/	CD
 ".CD CD CD 5CD 
GCDX 8<EE&EE EE #/	EE
 ".EE EE 5EE 
GEEN=5/ 5/r9   rj   )

__future__r   r>   r   ultralytics.nn.modulesr   r   Moduler   rj    r9   r*   <module>r      s8    #   3`")) `FS/bii S/r9   