
    cCi-                        S r SSKJr  SSKJrJr  SSKrSSKJr  SSKJ	r	  SSK
Jr  SS	KJr  SS
KJrJrJrJr  SSKJr  SSKJr  SSKJr  \R2                  " \5      r\\" SS9 " S S\5      5       5       r\ " S S\5      5       rSS jr " S S\R>                  5      r  " S S\R>                  5      r!\" SS9 " S S\5      5       r"SS/r#g)zPyTorch VitPose model.    )	dataclass)OptionalUnionN)nn   )BackboneOutput)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)load_backbone)can_return_tuple   )VitPoseConfigz6
@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of pose estimation models.
    """
)
class VitPoseEstimatorOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
    heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
        Heatmaps as predicted by the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    """

    loss: Optional[torch.FloatTensor] = None
    heatmaps: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None

@auto_docstring
class VitPosePreTrainedModel(PreTrainedModel):
    config: VitPoseConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the weights to `fp32` for `trunc_normal_`, then cast back to the
            # original `dtype`, since truncated normal init is not implemented for `half`.
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


def flip_back(output_flipped, flip_pairs, target_type="gaussian-heatmap"):
    """Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.Tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be "gaussian-heatmap" or "combined-target".
            gaussian-heatmap: classification target with a Gaussian distribution.
            combined-target: the combination of a classification target (response map) and a regression target
            (offset map). Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing
            for Human Pose Estimation (CVPR 2020).

    Returns:
        `torch.Tensor`: heatmaps flipped back to match the original image.
    """
    if target_type not in ["gaussian-heatmap", "combined-target"]:
        raise ValueError("target_type should be gaussian-heatmap or combined-target")

    if output_flipped.ndim != 4:
        raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]")

    batch_size, num_keypoints, height, width = output_flipped.shape
    channels = 1
    if target_type == "combined-target":
        channels = 3
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width)
    output_flipped_back = output_flipped.clone()

    # Swap the heatmap channels of each mirrored keypoint pair
    for left, right in flip_pairs.tolist():
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
    output_flipped_back = output_flipped_back.reshape((batch_size, num_keypoints, height, width))
    # Flip the heatmaps horizontally to undo the image flip
    output_flipped_back = output_flipped_back.flip(-1)
    return output_flipped_back

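# Illustrative sketch of how `flip_back` is typically used for test-time flip
# augmentation. The pair indices below are made up for the sketch, not a real
# keypoint layout:
#
#     heatmaps = model(pixel_values).heatmaps
#     heatmaps_flipped = model(pixel_values.flip(-1)).heatmaps
#     flip_pairs = torch.tensor([[1, 2], [3, 4]])  # hypothetical mirrored joints
#     heatmaps_restored = flip_back(heatmaps_flipped, flip_pairs)
#     heatmaps_avg = (heatmaps + heatmaps_restored) / 2

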
class VitPoseSimpleDecoder(nn.Module):
    """
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    """

    def __init__(self, config) -> None:
        super().__init__()

        self.activation = nn.ReLU()
        self.upsampling = nn.Upsample(scale_factor=config.scale_factor, mode="bilinear", align_corners=False)
        self.conv = nn.Conv2d(
            config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None) -> torch.Tensor:
        hidden_state = self.activation(hidden_state)
        hidden_state = self.upsampling(hidden_state)
        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps

class VitPoseClassicDecoder(nn.Module):
    """
    Classic decoding head consisting of two deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    """

    def __init__(self, config: VitPoseConfig):
        super().__init__()

        self.deconv1 = nn.ConvTranspose2d(
            config.backbone_config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False
        )
        self.batchnorm1 = nn.BatchNorm2d(256)
        self.relu1 = nn.ReLU()
        self.deconv2 = nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.relu2 = nn.ReLU()
        self.conv = nn.Conv2d(256, config.num_labels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None):
        hidden_state = self.deconv1(hidden_state)
        hidden_state = self.batchnorm1(hidden_state)
        hidden_state = self.relu1(hidden_state)

        hidden_state = self.deconv2(hidden_state)
        hidden_state = self.batchnorm2(hidden_state)
        hidden_state = self.relu2(hidden_state)

        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps

@auto_docstring(
    custom_intro="""
    The VitPose model with a pose estimation head on top.
    """
)
class VitPoseForPoseEstimation(VitPosePreTrainedModel):
    def __init__(self, config: VitPoseConfig) -> None:
        super().__init__(config)

        self.backbone = load_backbone(config)

        # The decoding heads below rely on these backbone attributes
        if not hasattr(self.backbone.config, "hidden_size"):
            raise ValueError("The backbone should have a hidden_size attribute")
        if not hasattr(self.backbone.config, "image_size"):
            raise ValueError("The backbone should have an image_size attribute")
        if not hasattr(self.backbone.config, "patch_size"):
            raise ValueError("The backbone should have a patch_size attribute")

        self.head = VitPoseSimpleDecoder(config) if config.use_simple_decoder else VitPoseClassicDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        dataset_index: Optional[torch.Tensor] = None,
        flip_pairs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> VitPoseEstimatorOutput:
        r"""
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training. With a single dataset, index 0 refers to that
            dataset. With multiple datasets, index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B
            (e.g. CrowdPose).
        flip_pairs (`torch.Tensor`, *optional*):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
        ```"""
        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported")

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values,
            dataset_index=dataset_index,
            **kwargs,
        )

        # Turn the last hidden state back into a (batch_size, hidden_size, patch_height, patch_width) feature map
        sequence_output = outputs.feature_maps[-1]
        batch_size = sequence_output.shape[0]
        patch_height = self.config.backbone_config.image_size[0] // self.config.backbone_config.patch_size[0]
        patch_width = self.config.backbone_config.image_size[1] // self.config.backbone_config.patch_size[1]
        sequence_output = (
            sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous()
        )

        heatmaps = self.head(sequence_output, flip_pairs=flip_pairs)

        return VitPoseEstimatorOutput(
            loss=loss,
            heatmaps=heatmaps,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

__all__ = ["VitPoseForPoseEstimation", "VitPosePreTrainedModel"]
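
# End-to-end usage sketch, continuing the docstring example above. It assumes the
# image processor exposes a `post_process_pose_estimation` helper and that `outputs`
# and `boxes` come from the preprocessing shown there:
#
#     pose_results = processor.post_process_pose_estimation(outputs, boxes=boxes)
#     person_pose = pose_results[0][0]           # first image, first person box
#     keypoints = person_pose["keypoints"]       # (num_keypoints, 2) pixel coordinates
#     scores = person_pose["scores"]             # per-keypoint confidence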