
    hY                       S SK Jr  S SKrS SKJs  Jr  S SKJr  S SKJr  S SK	J
r
  S SKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJrJr  Sr " S S\R2                  5      r " S S\R                  R2                  5      rg)    )annotationsN)nn)trunc_normal_)MLP)LOGGER   )SAM2TwoWayTransformer)MaskDecoderSAM2MaskDecoder)ImageEncoderViTPromptEncoder)get_1d_sine_peselect_closest_cond_framesg      c                  d   ^  \ rS rSr% SrSrS\S'     S	           S
U 4S jjjrS rSr	U =r
$ )SAMModel   aw  
Segment Anything Model (SAM) for object segmentation tasks.

This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images
and input prompts.

Attributes:
    mask_threshold (float): Threshold value for mask prediction.
    image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
    prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
    mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
    pixel_mean (torch.Tensor): Mean values for normalizing pixels in the input image.
    pixel_std (torch.Tensor): Standard deviation values for normalizing pixels in the input image.

Methods:
    set_imgsz: Set image size to make model compatible with different image sizes.

Examples:
    >>> image_encoder = ImageEncoderViT(...)
    >>> prompt_encoder = PromptEncoder(...)
    >>> mask_decoder = MaskDecoder(...)
    >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
    >>> # Further usage depends on SAMPredictor class

Notes:
    All forward() operations are implemented in the SAMPredictor class.
        floatmask_thresholdc                &  > [         TU ]  5         Xl        X l        X0l        U R                  S[        R                  " U5      R                  SSS5      S5        U R                  S[        R                  " U5      R                  SSS5      S5        g)aj  
Initialize the SAMModel class to predict object masks from an image and input prompts.

Args:
    image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
    prompt_encoder (PromptEncoder): Encodes various types of input prompts.
    mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
    pixel_mean (list[float]): Mean values for normalizing pixels in the input image.
    pixel_std (list[float]): Standard deviation values for normalizing pixels in the input image.

Examples:
    >>> image_encoder = ImageEncoderViT(...)
    >>> prompt_encoder = PromptEncoder(...)
    >>> mask_decoder = MaskDecoder(...)
    >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
    >>> # Further usage depends on SAMPredictor class

Notes:
    All forward() operations moved to SAMPredictor.

pixel_meanr   F	pixel_stdN)	super__init__image_encoderprompt_encodermask_decoderregister_buffertorchTensorview)selfr   r   r   r   r   	__class__s         \/home/james-whalen/.local/lib/python3.13/site-packages/ultralytics/models/sam/modules/sam.pyr   SAMModel.__init__8   sy    8 	*,(\5<<
+C+H+HQPQ+RTYZ[%,,y*A*F*Fr1a*PRWX    c                
   [        U R                  S5      (       a  U R                  R                  U5        XR                  l        U Vs/ s H  o"S-  PM	     snU R                  l        US   U R                  l        gs  snf )CSet image size to make model compatible with different image sizes.	set_imgsz   r   N)hasattrr   r*   r   input_image_sizeimage_embedding_sizeimg_sizer#   imgszxs      r%   r*   SAMModel.set_imgsz[   sl    4%%{33((//4,EJ3KUGU3K0&+Ah# 4Ls   B )r   r   r   ))g33333^@gR]@gRY@)g(\2M@g(\L@g     L@)r   r   r   r   r   r
   r   list[float]r   r4   returnNone)__name__
__module____qualname____firstlineno____doc__r   __annotations__r   r*   __static_attributes____classcell__r$   s   @r%   r   r      ss    8  NE #<!8!Y&!Y &!Y "	!Y
  !Y !Y 
!Y !YF/ /r'   r   c                  >  ^  \ rS rSr% SrSrS\S'                                   S               SU 4S jjjr\S 5       r	S r
S	 r    SS
 jrSS jrSS jrS r SS jrS rS rS r   SS jrS r\S 5       rSS jrS rSrU =r$ )	SAM2Modeld   a  
SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.

This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms
for temporal consistency and efficient tracking of objects across frames.

Attributes:
    mask_threshold (float): Threshold value for mask prediction.
    image_encoder (ImageEncoderViT): Visual encoder for extracting image features.
    memory_attention (nn.Module): Module for attending to memory features.
    memory_encoder (nn.Module): Encoder for generating memory representations.
    num_maskmem (int): Number of accessible memory frames.
    image_size (int): Size of input images.
    backbone_stride (int): Stride of the backbone network output.
    sam_prompt_embed_dim (int): Dimension of SAM prompt embeddings.
    sam_image_embedding_size (int): Size of SAM image embeddings.
    sam_prompt_encoder (PromptEncoder): Encoder for processing input prompts.
    sam_mask_decoder (SAM2MaskDecoder): Decoder for generating object masks.
    obj_ptr_proj (nn.Module): Projection layer for object pointers.
    obj_ptr_tpos_proj (nn.Module): Projection for temporal positional encoding in object pointers.
    hidden_dim (int): Hidden dimension of the model.
    mem_dim (int): Memory dimension for encoding features.
    use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
    use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
    max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder cross-attention.
    add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers.
    proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
        encoding in object pointers.
    use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in temporal positional encoding.
    only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past during
        evaluation.
    pred_obj_scores (bool): Whether to predict if there is an object in the frame.
    pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
    fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
    soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
    use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
    no_obj_embed_spatial (torch.Tensor | None): No-object embedding for spatial frames.
    max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
    directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
        first frame.
    multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial
        conditioning frames.
    multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
    multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
    multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
    use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
    iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
    memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
    non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
        memory encoder during evaluation.
    sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
    sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
    binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
        with clicks during evaluation.
    use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
        prompt encoder and mask decoder on frames with mask input.

Methods:
    forward_image: Process image batch through encoder to extract multi-level features.
    track_step: Perform a single tracking step, updating object masks and memory features.
    set_binarize: Set binarize for VideoPredictor.
    set_imgsz: Set image size to make model compatible with different image sizes.

Examples:
    >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
    >>> image_batch = torch.rand(1, 3, 512, 512)
    >>> features = model.forward_image(image_batch)
    >>> track_results = model.track_step(0, True, features, None, None, None, {})
r   r   r   c$                  > [         T$U ]  5         Xl        Xl        U(       a  SOSU l        UU l        UU l        U(       a%  [        R                  R                  SSSSS9U l
        UU l        U(       a	  U(       d   eUU l        UU l        UU l        X l        UR                   U l        X0l        U R"                  U l        [)        U R$                  S5      (       aW  [)        U R$                  R*                  S5      (       a2  U R$                  R*                  R,                  R.                  S   U l        X@l        [        R                  R3                  [        R4                  " USSU R&                  5      5      U l        [9        U R6                  SS	9  [        R                  R3                  [        R4                  " SSU R"                  5      5      U l        [        R                  R3                  [        R4                  " SSU R"                  5      5      U l        [9        U R:                  SS	9  [9        U R<                  SS	9  Xl        Xpl         Xl!        Xl"        UU l#        UU l$        Xl%        Xl&        Xl'        UU l(        UU l)        UU l*        UU l+        XPl,        X`l-        U"U l.        UU l/        UU l0        UU l1        UU l2        U Rb                  (       a&  U R^                  (       d   eU R
                  (       d   eU R^                  (       ah  U R
                  (       aW  [        R                  R3                  [        R4                  " SU R"                  5      5      U l3        [9        U Rf                  SS	9  U U l4        S
U l5        U!(       aW  [        R                  R3                  [        R4                  " SU R&                  5      5      U l5        [9        U Rj                  SS	9  U Rm                  5         Xl7        U#(       aQ  [p        Rr                  " S5        [        Rt                  " U R                  Rv                  SSSS9U R                  l;        g
g
)a  
Initialize the SAM2Model for video object segmentation with memory-based tracking.

Args:
    image_encoder (nn.Module): Visual encoder for extracting image features.
    memory_attention (nn.Module): Module for attending to memory features.
    memory_encoder (nn.Module): Encoder for generating memory representations.
    num_maskmem (int): Number of accessible memory frames.
    image_size (int): Size of input images.
    backbone_stride (int): Stride of the image backbone output.
    sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
    sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
    binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
        with clicks during evaluation.
    use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
        prompt encoder and mask decoder on frames with mask input.
    max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
    directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
        first frame.
    use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
    multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial
        conditioning frames.
    multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
    multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
    multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
    use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
    iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
    memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
    non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
        memory encoder during evaluation.
    use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
    max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
        cross-attention.
    add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in
        the encoder.
    proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
        encoding in object pointers.
    use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in the temporal positional encoding
        in the object pointers.
    only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
        during evaluation.
    pred_obj_scores (bool): Whether to predict if there is an object in the frame.
    pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
    fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
    soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
    use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
    no_obj_embed_spatial (bool): Whether add no obj embedding to spatial frames.
    sam_mask_decoder_extra_args (dict | None): Extra arguments for constructing the SAM mask decoder.
    compile_image_encoder (bool): Whether to compile the image encoder for faster inference.

Examples:
    >>> image_encoder = ImageEncoderViT(...)
    >>> memory_attention = SAM2TwoWayTransformer(...)
    >>> memory_encoder = nn.Sequential(...)
    >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
    >>> image_batch = torch.rand(1, 3, 512, 512)
    >>> features = model.forward_image(image_batch)
    >>> track_results = model.track_step(0, True, features, None, None, None, {})
   r      )kernel_sizestrideout_projweightr   g{Gz?)stdNzFImage encoder compilation is enabled. First forward pass will be slow.zmax-autotuneTF)mode	fullgraphdynamic)<r   r   r   use_high_res_features_in_samnum_feature_levelsuse_obj_ptrs_in_encodermax_obj_ptrs_in_encoderr    r   Conv2dmask_downsampleadd_tpos_enc_to_obj_ptrsproj_tpos_enc_in_obj_ptrsuse_signed_tpos_enc_to_obj_ptrs"only_obj_ptrs_in_the_past_for_evalmemory_attentiond_model
hidden_dimmemory_encodermem_dimr,   rH   rI   shapenum_maskmem	Parameterzerosmaskmem_tpos_encr   no_mem_embedno_mem_pos_encdirectly_add_no_mem_embedsigmoid_scale_for_mem_encsigmoid_bias_for_mem_enc"binarize_mask_from_pts_for_mem_encnon_overlap_masks_for_mem_encmemory_temporal_stride_for_eval$use_mask_input_as_output_without_sammultimask_output_in_sammultimask_min_pt_nummultimask_max_pt_nummultimask_output_for_trackinguse_multimask_token_for_obj_ptriou_prediction_use_sigmoid
image_sizebackbone_stridesam_mask_decoder_extra_argspred_obj_scorespred_obj_scores_mlpfixed_no_obj_ptrsoft_no_obj_ptr
no_obj_ptruse_mlp_for_obj_ptr_projno_obj_embed_spatial_build_sam_headsmax_cond_frames_in_attnr   infocompileforward)%r#   r   rX   r[   r^   rq   rr   re   rf   rg   rj   r|   rd   rN   rk   rl   rm   rn   ro   rp   ri   rh   rP   rQ   rT   rU   rV   rW   rt   ru   rv   rw   ry   rz   rs   compile_image_encoderr$   s%                                       r%   r   SAM2Model.__init__   sU   B 	 +,H)'C!'>$'>$" $)88??1aQq?#QD (@%$+++)B&/N,2T/ !1*22 -4&&
33@S@S@\@\^f8g8g..77>>DDQGDL& % 2 25;;{AqRVR^R^3_ `d++6!HH..u{{1a/QR#hh00Q4??1STd''T2d))t4)B& *C&(@%2T/-J*/N, 5Y1'>$$8!$8!-J*/N,*D' %.+F(.#6  0.  ''''////D$@$@#hh00Q1PQDO$//t4(@%$(!(-(:(:5;;q$,,;W(XD%$33>'>$ !KK`a).""**#	*D& !r'   c                H    [        U R                  5       5      R                  $ )z=Return the device on which the model's parameters are stored.)next
parametersdevicer#   s    r%   r   SAM2Model.devicel  s     DOO%&---r'   c                    [        S5      e)zWProcess image and prompt inputs to generate object masks and scores in video sequences.zPlease use the corresponding methods in SAM2VideoPredictor for inference.See notebooks/video_predictor_example.ipynb for an example.)NotImplementedError)r#   argskwargss      r%   r   SAM2Model.forwardq  s    !J
 	
r'   c                (   U R                   U l        U R                  U R                  -  U l        [        U R                  U R                  U R                  4U R                  U R                  4SS9U l        [        SS[        SU R                  SSS9U R                  SSU R                  U R                  U R                  U R                  U R                  S	.
U R                  =(       d    0 D6U l        U R                   (       a|  ["        R$                  R'                  U R                   U R                   5      U l        U R*                  (       a1  [-        U R                   U R                   U R                   S5      U l        O#["        R$                  R/                  5       U l        U R0                  (       a:  ["        R$                  R'                  U R                   U R2                  5      U l        g
["        R$                  R/                  5       U l        g
)zMBuild SAM-style prompt encoder and mask decoder for image segmentation tasks.r+   )	embed_dimr.   r-   mask_in_chansrD      i      )depthembedding_dimmlp_dim	num_heads   )
num_multimask_outputstransformertransformer_dimiou_head_depthiou_head_hidden_dimuse_high_res_featuresrp   rt   ru   ro   N )rZ   sam_prompt_embed_dimrq   rr   sam_image_embedding_sizer   sam_prompt_encoderr   r	   rN   rp   rt   ru   ro   rs   sam_mask_decoderrP   r    r   Linearobj_ptr_projry   r   IdentityrU   r\   obj_ptr_tpos_projr   s    r%   r{   SAM2Model._build_sam_headsx  s   $(OO!(,4;O;O(O% #0//----" #oot?#
 !0 !
"#-"77	 !55 #"&"C"C'+'F'F 00 $ 8 8,0,P,P!
  //52!!
$ '' % QD,,$'$//[\$]! % 1 1 3D)) &+XX__T__dll%SD"%*XX%6%6%8D"r'   c           
        UR                   S   nUR                  nUR                  S5      U R                  :X  d   eUR                  S5      U R                  :X  d   eUR                  S5      U R                  :X  d   eUb3  US   nUS   n	UR                   S   U:X  a  U	R                   S   U:X  d   eOG[
        R                  " USSXqR                  S9n[
        R                  " US[
        R                  US	9* n	Ub  [        UR                   5      S
:X  a  UR                   SS US4:X  d   eUR                   SS U R                  R                  :w  a;  [        R                  " UR                  5       U R                  R                  SSSS9n
OUn
OSn
U R                  X4SU
S9u  pU R!                  UU R                  R#                  5       UUUSUS9u  pnnU R$                  (       a)  US:  n[
        R&                  " USS2SS4   U[(        5      n[        R                  " UU R*                  U R*                  4SSS9nUSS2S4   nU(       ar  [
        R,                  " USS9n[
        R.                  " XgS9nUUU4   R1                  S5      nUUU4   R1                  S5      nUR                  S5      S:  a  UUU4   nOUUnnU R3                  U5      nU R$                  (       ah  U R4                  (       a  UR7                  5       nOWR9                  UR                  5      nU R:                  (       a  UU-  nUSU-
  U R<                  -  -   nUUUUUUU4$ )aC	  
Forward pass through SAM prompt encoders and mask heads.

This method processes image features and optional point/mask inputs to generate object masks and scores.

Args:
    backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
    point_inputs (dict[str, torch.Tensor] | None): Dictionary containing point prompts.
        'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute
            pixel-unit coordinates in (x, y) format for P input points.
        'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
            0 means negative clicks, and -1 means padding.
    mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
        same spatial size as the image.
    high_res_features (list[torch.Tensor] | None): List of two feature maps with shapes
        (B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
        for SAM decoder.
    multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
        output only 1 mask and its IoU estimate.

Returns:
    low_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
    high_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
    ious (torch.Tensor): Tensor of shape (B, M) with estimated IoU for each output mask.
    low_res_masks (torch.Tensor): Tensor of shape (B, 1, H*4, W*4) with the best low-resolution mask.
    high_res_masks (torch.Tensor): Tensor of shape (B, 1, H*16, W*16) with the best high-resolution mask.
    obj_ptr (torch.Tensor): Tensor of shape (B, C) with object pointer vector for the output mask.
    object_score_logits (torch.Tensor): Tensor of shape (B) with object score logits.

Examples:
    >>> backbone_features = torch.rand(1, 256, 32, 32)
    >>> point_inputs = {"point_coords": torch.rand(1, 2, 2), "point_labels": torch.tensor([[1, 0]])}
    >>> mask_inputs = torch.rand(1, 1, 512, 512)
    >>> results = model._forward_sam_heads(backbone_features, point_inputs, mask_inputs)
    >>> (
    ...     low_res_multimasks,
    ...     high_res_multimasks,
    ...     ious,
    ...     low_res_masks,
    ...     high_res_masks,
    ...     obj_ptr,
    ...     object_score_logits,
    ... ) = results
r   r   r   rD   Npoint_coordspoint_labelsr   dtype)r   r   rE   FbilinearTsizealign_cornersrK   	antialias)pointsboxesmasks)image_embeddingsimage_pesparse_prompt_embeddingsdense_prompt_embeddingsmultimask_outputrepeat_imagehigh_res_features)r   rK   r   r   dimr   )r]   r   r   r   r   r    r`   r   onesint32lenr   mask_input_sizeFinterpolater   r   get_dense_pert   whereNO_OBJ_SCORErq   argmaxarange	unsqueezer   rw   sigmoidtorv   rx   )r#   backbone_featurespoint_inputsmask_inputsr   r   Br   sam_point_coordssam_point_labelssam_mask_promptsparse_embeddingsdense_embeddingslow_res_multimasksioussam_output_tokensobject_score_logitsis_obj_appearinghigh_res_multimaskssam_output_tokenbest_iou_inds
batch_indslow_res_maskshigh_res_masksobj_ptrlambda_is_obj_appearings                             r%   _forward_sam_headsSAM2Model._forward_sam_heads  s   h ##A&")) %%a(D,E,EEEE %%a(D,I,IIII %%a(D,I,IIII #+N;+N;#))!,16F6L6LQ6OST6TTT6T  %{{1a6I`I`a %

1au{{6 RR " {(()Q.;3D3DRa3HQPQF3RRR  %)@)@)P)PP"#--%%'00@@"'#"# #. #O.2.E.E$7! /F /
+
 LPK`K`.,,99;%6$4-/ La L
H"35H 2Q6 "'-=atm-LN`bn!o  mm//4??3	
 -QT2!LL26Ma7J.z=/HISSTUVM0]1JKUUVWXN %%a(1,#4Z5N#O ,>@S>M ##$45##*=*E*E*G'*:*=*=gmm*L'$$1G;%<!< OOG
 	
r'   c                $   Su  pEUR                  5       nXd-  U-   n[        R                  " UUR                  S5      S-  UR                  S5      S-  4SSSS9nUR	                  UR
                  S	   S
5      R                  5       n	U R                  (       a  Ub  Uc8  [        R                  " UR
                  S	   U R                  UR                  S9n
O'U R                  UU R                  U5      US9u          pn[        R                  " UR                  S
5      R                  5       S:  S
S9nUS   nUR                  5       nXM-  U-   nU R                  (       a*  U R                   (       a  X-  n
U
S
U-
  U R"                  -  -   n
UUU	UUU
U4$ )zFProcess mask inputs directly as output, bypassing SAM encoder/decoder.)g      4@      $r   rE   r   Fr   Tr   r   r   r   )r   r   r   r   r   ).N)r   r   r   r   new_onesr]   rP   r    r`   rZ   r   r   rS   anyflattenrt   rv   rx   )r#   r   r   r   	out_scaleout_biasmask_inputs_floatr   r   r   r   _r   r   r   s                  r%   _use_mask_as_outputSAM2Model._use_mask_as_output>  s    *	'--/*6A %%b)Q.0C0CB0G10LM
 ##K$5$5a$8!<BBD++/@/HL]Lekk+"3"3A"6P[PbPbcG )-(?(?"3 001BC"3 )@ )%Aq!QA !99[%8%8%;%A%A%Cc%IqQ+I6"2"8"8":'AHL$$1;%<!< OOG 
 	
r'   c                    U R                  U5      nU R                  (       aN  U R                  R                  US   S   5      US   S'   U R                  R	                  US   S   5      US   S'   U$ )zRProcess image batch through encoder to extract multi-level features for SAM model.backbone_fpnr   r   )r   rN   r   conv_s0conv_s1)r#   	img_batchbackbone_outs      r%   forward_imageSAM2Model.forward_imagem  s{    )))4,, /3.C.C.K.KLYgLhijLk.lL(+.2.C.C.K.KLYgLhijLk.lL(+r'   c                   [        US   5      [        US   5      :X  d   e[        US   5      U R                  :  d   eUS   U R                  * S nUS   U R                  * S nU Vs/ s H   oDR                  S   UR                  S   4PM"     nnU Vs/ s H$  oDR                  S5      R	                  SSS5      PM&     nnU Vs/ s H$  oDR                  S5      R	                  SSS5      PM&     nnXX54$ s  snf s  snf s  snf )	zZPrepare and flatten visual features from the image backbone output for further processing.r   vision_pos_encNr   r   r   r   r   )r   rO   r]   r   permute)r#   r   feature_mapsvision_pos_embedsr2   
feat_sizesvision_featss          r%   _prepare_backbone_features$SAM2Model._prepare_backbone_featuresw  s   </0CEU8V4WWWW</0D4K4KKKK#N3T5L5L4L4NO()9:D<S<S;S;UV:KL:KQwwr{AGGBK0:K
L?KL|!		!,,Q15|LDUVDUqYYq\11!Q:DUV+<HH MLVs   )'C8+C=+Dc	                   US   R                  S5      n	U R                  n
US   u  pUS   R                  nU R                  S:X  a&  US   R	                  SSS5      R                  XX5      $ SnU(       a  SOSnU(       Gd  / / nn[        US   5      S:  d   eUS   n[        UUU R                  5      u  nnUR                  5        Vs/ s H  nSU4PM	     nnU R                  (       a  SOU R                  n[        SU R                  5       H  nU R                  U-
  nUS:X  a  U(       a  UU-   OUU-
  nO6U(       d  US-
  U-  U-  nUUS-
  U-  -
  nOUS-   * U-  * U-  nUUS-
  U-  -   nUS   R                  US5      nUc  UR                  US5      nUR                  UU45        M     U H  u  nnUc  M  US   R                  XR                   S	:H  S
9nUR                  UR#                  S5      R	                  SSS5      5        US   S   R                  US9nUR#                  S5      R	                  SSS5      nUU R$                  U R                  U-
  S-
     -   nUR                  U5        M     U R&                  (       Ga  [)        XpR*                  5      nU R                  (       dP  U R,                  (       a?  UR/                  5        VVs0 s H!  u  nnU(       a	  UU:  d  M  OUU::  d  M  UU_M#     n nnOUn U R/                  5        VVs/ s H2  u  nnU R0                  (       a  UU-
  U-  O[3        UU-
  5      US   4PM4     n!nn[        SU5       He  n"U(       a  UU"-   OUU"-
  nUS:  d	  Ub  UU:  a    ODUS   R                  UUR                  US5      5      nUc  MO  U!R                  U"US   45        Mg     U!(       Ga  [5        U!6 u  n#n$[6        R8                  " U$SS9n%U R:                  (       a  US-
  n&U R<                  (       a  U
OU R>                  n'[6        R@                  " U#XS   RB                  S9n([E        U(U&-  U'S9n(U RG                  U(5      n(U(RI                  S5      RK                  SXR>                  5      n(O%U%RM                  [        U#5      XR>                  5      n(U R>                  U
:  aj  U%RO                  SXU R>                  -  U R>                  5      n%U%R	                  SSSS5      R#                  SS5      n%U(RQ                  XR>                  -  SS9n(UR                  U%5        UR                  U(5        U%RR                  S   nOSnOU RT                  (       a7  US   U RV                  -   n)U)R	                  SSS5      R                  XX5      n)U)$ U RV                  RK                  SXR>                  5      /nU RX                  RK                  SXR>                  5      /n[6        RZ                  " USS9n*[6        RZ                  " USS9n+U R]                  UUU*U+US9n)U)R	                  SSS5      R                  XX5      n)U)$ s  snf s  snnf s  snnf )zePrepare memory-conditioned features by fusing current frame's visual features with previous memories.r   r   r   r   cond_frame_outputsnon_cond_frame_outputsNmaskmem_featurescuda)r   non_blockingmaskmem_pos_encr   r   r   r   rD   )currcurr_posmemory
memory_posnum_obj_ptr_tokens)/r   rZ   r   r^   r   r"   r   r   r|   valuestrainingri   rangegetappendr   typer   ra   rP   minrQ   rW   itemsrV   abszipr    stackrT   rU   r\   tensorr   r   r   r   expand	new_zerosreshaperepeat_interleaver]   rd   rb   rc   catrX   ),r#   	frame_idxis_init_cond_framecurrent_vision_featscurrent_vision_pos_embedsr   output_dict
num_framestrack_in_reverser   CHWr   r  tpos_sign_multo_cat_memoryto_cat_memory_pos_embedcond_outputsselected_cond_outputsunselected_cond_outputsoutt_pos_and_prevsrt_post_relprev_frame_idxprevfeatsmaskmem_encrQ   tptr_cond_outputspos_and_ptrst_diffpos_list	ptrs_listobj_ptrs
t_diff_maxtpos_dimobj_pospix_feat_with_memr
  memory_pos_embeds,                                               r%   $_prepare_memory_conditioned_features.SAM2Model._prepare_memory_conditioned_features  s    !$))!,OO"~%b)00 q '+33Aq!<AA!MM.A!572M {#789A===&';<L=W<)E)E>:!#: 4I3O3O3QR3QC3x3QOR
 ]](L(LAq$"2"23((50A::JY%6PY\aPaN) (11}&:a%?N%3uqyAo%EN *3Q'71'<%=%AN%3uqyAo%EN!":;??PTU; 255ndKC&&s|4- 40  /t< /0336P[P[_ePe3f$$U]]1%5%=%=aA%FG"#45b9<<F<K)11!4<<Q1E)D,A,A$BRBRUZBZ]^B^,__'..{;  / +++*-j:V:V*W' }})P)P '<&A&A&C(&CFAs.>AN *AN 3&C % ($ (=$ #3"8"8":  #;3  $CC ']m;!$Y]!3I #;    $A'>?F.>	F*IPVDVA1u!7AO%&>?CCAG^GbGbcdfjGklC$++VS^,DE @  *-|*<'Hi${{9!<H 44%<q%@
(,(F(F1DLL"',,xceNfNlNl"m"0:1E8"T"&"8"8"A")"3"3A"6"="=b!\\"R"*"4"4S]A||"T||a'#+#3#3B;Ldll#[#+#3#3Aq!Q#?#G#G1#M")";";A<MST";"U!((2+227;)1):&)*& --$8$<t?P?P$P!$5$=$=aA$F$K$KARS$W!(( "..55aLLIJM'+':':'A'A!Q'U&V# =a0 99%<!D 11%.'1 2 
 .55aA>CCA!O  A Sd( s   YY$0Y$9Y$9Y*c                   US   R                  S5      nU R                  nUS   u  pUS   R                  SSS5      R                  XgX5      n
U R                  (       a"  U R
                  (       d  U R                  U5      nU R                  =(       a    UnU(       a0  U R
                  (       d  US:  R                  U
R                  5      nO[        R                  " U5      nU R                  S:w  a  XR                  -  nU R                  S:w  a  XR                  -   nU R                  XSS9nUS	   nUS
   nU R                  bE  US:  R!                  5       nUSUS   -
  U R                  S   R"                  " UR$                  6 -  -  nX4$ )zXEncode frame features and masks into a new memory representation for video segmentation.r   r   r   r         ?r   T)skip_mask_sigmoidvision_featuresr   ).NN)r   rZ   r   r"   rh   r  "_apply_non_overlapping_constraintsrg   r   r   r    r   re   rf   r[   rz   r   r  r]   )r#   r   r   pred_masks_high_resr   is_mask_from_ptsr   r%  r&  r'  pix_featbinarizemask_for_memmaskmem_outr  r  r   s                    r%   _encode_new_memorySAM2Model._encode_new_memory*  s    !$))!,OO"~'+33Aq!<AA!M--dmm #'"I"IJ]"^::O?ODMM/!377GL !==)<=L))S0'*H*HHL((C/'*G*GGL))(TX)Y&'89%&67 $$0 3a 7>>@%5o%F!F$JcJcKfK&,,K. !. .  00r'   c                d   [        U5      S:  ah  [        USS USS 5       VVs/ s HH  u  pUR                  SSS5      R                  " UR	                  S5      UR	                  S5      /UQ76 PMJ     nnnOSnUb[  U R
                  (       aJ  US   R                  SSS5      nUR                  " SU R                  /US   Q76 nU R                  XU5      nOPU R                  UUUSS USS USS UU	U
S9nUb
  Ub  Ub   eUnU R                  X&5      nU R                  UUUUUS9nUX4$ s  snnf )hPerform a single tracking step, updating object masks and memory features based on current frame inputs.r   Nr   r   r   )r  r  r   r!  r   r"  r#  r$  )r   r   r   r   r   )r   r  r   r"   r   rj   rZ   r   rC  _use_multimaskr   )r#   r  r  r   r!  r   r   r   r"  r#  r$  prev_sam_mask_logitsr2   sr   rL  sam_outputsr   s                     r%   _track_stepSAM2Model._track_stepV  s     #$q(   4Sb 9:cr?K!KDA 		!Q"''q	166!9AqAK  !
 !%"t'P'P ,B/771a@H}}RJ:b>JH22;J[\K @@##5%9"#%>*CBC*H%bc?'%!1 A 	H $/#/K4GGG2#223ET11"*)'"3!1 2 K -77O!s   AD,c                    U(       a0  U R                   S:  a   U R                  UUUUUSLS9u  pXS'   XS'   gSUS'   SUS'   g)z^Run memory encoder on predicted mask to encode it into a new memory feature for future frames.r   N)r   r   rJ  r   rK  r  r  )r^   rP  )
r#   r   r   r   run_mem_encoderr   r   current_outr  r  s
             r%   _encode_memory_in_output"SAM2Model._encode_memory_in_output  sl     t//!3040G0G%9%$2$7".d": 1H 1- /?*+-<)*.2K*+-1K)*r'   c                    U R                  UUUUUUUUU	U
U5      u  n  nUu      pnnnUUUS.nU R                  (       d  UUS'   U R                  UUUUUUU5        U$ )rS  )
pred_masksrJ  r   r   )rX  r  r]  )r#   r  r  r   r!  r   r   r   r"  r#  r$  r[  rU  rW  r   r   r   r   r   r\  s                       r%   
track_stepSAM2Model.track_step  s    , !,, % 
Q P[L1a9L (#1

 }} 2EK-. 	%% 	
 r'   c                    Uc  SOUS   R                  S5      nU R                  =(       aE    U=(       d    U R                  =(       a)    U R                  Us=:*  =(       a    U R                  :*  $ s  $ )zaDetermine whether to use multiple mask outputs in the SAM head based on configuration and inputs.r   r   r   )r   rk   rn   rl   rm   )r#   r  r   num_ptss       r%   rT  SAM2Model._use_multimask  sn    #+!n1M1R1RST1U(( T#It'I'IT**gRR9R9RR	
 S	
r'   c           	        U R                   S   nUS:X  a  U $ U R                  n[        R                  " U SSS9n[        R                  " XS9SS2SSS4   nX4:H  n[        R
                  " XP[        R                  " U SS95      n U $ )	z\Apply non-overlapping constraints to masks, keeping the highest scoring object per location.r   r   T)r   keepdimr   Nr   )max)r]   r   r    r   r   r   clamp)r`  
batch_sizer   max_obj_indsbatch_obj_indskeeps         r%   rI  ,SAM2Model._apply_non_overlapping_constraints  s      %%a(
?""||JAtDj@D$PTATU- [[5;;zu3UV
r'   c                    Xl         g)z Set binarize for VideoPredictor.N)rg   )r#   rM  s     r%   set_binarizeSAM2Model.set_binarize   s    2:/r'   c                    US   U l         XR                  l        U Vs/ s H  o"S-  PM	     snU R                  l        U R                   U R                  -  U l        gs  snf )r)   r   r+   N)rq   r   r-   r.   rr   r   r0   s      r%   r*   SAM2Model.set_imgsz  sU    (380IN7OAR7O4(,4;O;O(O% 8Ps   A)0rT   rr   rg   rd   rv   rZ   r   rq   rp   rS   ra   r|   rQ   r\   rX   r[   ri   rm   rl   rn   rk   rb   rc   rz   rx   rh   rO   r^   r   r   rW   rt   ru   rU   r   r   rs   r   r   rf   re   rw   rN   rj   ry   ro   rP   rV   )    i   r+   rF  r   FFr   FFFr   r   FFFr   FFr+   TFFFFFFFFFNF)ro   boolrt   ru  ru   ru  rv   ru  rw   ru  ry   ru  rz   ru  r   ru  )NNNF)NN)r   ztorch.Tensor)F)FTN)r7   r8   r9   r:   r;   r   r<   r   propertyr   r   r{   r   r   r   r   rC  rP  rX  r]  ra  rT  staticmethodrI  rp  r*   r=   r>   r?   s   @r%   rA   rA   d   s   DL  NE "%!$+0-2 ""'%* %&+05#(()&+ % "!%"'(-+0 %$)!& %).%*$(&+I}& *.'}: ;}< "=}> ?}@ A}B #'C}D #E}H  $I} }~ . .
-9d U
n-
^I0 b!H*1X88t2H  !':x
  ";P Pr'   rA   )
__future__r   r    torch.nn.functionalr   
functionalr   torch.nn.initr   ultralytics.nn.modulesr   ultralytics.utilsr   blocksr	   decodersr
   r   encodersr   r   utilsr   r   r   Moduler   rA   r   r'   r%   <module>r     s]    #     ' & $ ) 2 4 = H/ryy H/VeP ePr'   