
    cCi'                         S r SSKrSSKJrJr  SSKrSSKrSSKJ	r	  SSK
JrJrJrJrJrJr  SSKJrJr  SSKJrJr  SS	KJrJr  SS
KJrJrJr  SSKJr   " S S\5      r \" S\S5       " S S\5      5       r!S/r"g)z#video processor class for GLM-4.1V.    N)OptionalUnion   )BatchFeature)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimensionPILImageResamplingSizeDictget_image_size)UnpackVideosKwargs)
TensorTypeadd_start_docstrings)BASE_VIDEO_PROCESSOR_DOCSTRINGBaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videos   )smart_resizec                       \ rS rSr% Sr\\\4   \S'   Sr	\
\   \S'   Sr\
\   \S'   Sr\
\   \S'   Sr\
\\      \S'   Sr\
\\      \S'   S	rg)
Glm4vVideoProcessorInitKwargs'   Nmax_image_size
patch_sizetemporal_patch_size
merge_size
image_mean	image_std )__name__
__module____qualname____firstlineno__r   dictstrint__annotations__r   r   r   r   r   listfloatr    __static_attributes__r!       j/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/glm4v/video_processing_glm4v.pyr   r   '   se    %)NDcN) $J$)-#- $J$(,Je%,'+IxU$+r-   r   zfConstructs a fast GLM-4V image processor that dynamically resizes videos based on the original videos.aj  
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    c                      ^  \ rS rSr\R
                  rSSS.rSS0r\	r
\rSrSrSrSrSrSrSrS	rSr\rS
rSrSS/rS\\   4U 4S jjr S&S\\   S\4U 4S jjjr  S&S\!S\\"\#\$4      4S jjr%SSS\R
                  SSSSSSSSS4S\&\'RP                     S\)S\)S\\   S\S\)S\$S\)S\\"\$\&\$   4      S\\"\$\&\$   4      S \\#   S!\\#   S"\\#   S#\\"\*\+4      4S$ jjr,S%r-U =r.$ )'Glm4vVideoProcessor0   i 1  i )shortest_edgelongest_edger3   T      i,     pixel_values_videosvideo_grid_thwkwargsc                    > [         TU ]  " S0 UD6  U R                  bF  U R                  R                  SS 5      b  U R                  R                  SS 5      c  [	        S5      eg g )Nr2   r3   :size must contain 'shortest_edge' and 'longest_edge' keys.r!   )super__init__sizeget
ValueError)selfr9   	__class__s     r.   r=   Glm4vVideoProcessor.__init__Q   s^    "6"99 IIMM/408DIIMM.Z^<_<gYZZ =h !r-   Nr>   returnc                 \   > Ub  SU;  d  SU;  a  [        S5      e[        TU ]  " SSU0UD6$ )z
Update kwargs that need further processing before being validated
Can be overridden by subclasses to customize the processing of kwargs.
r2   r3   r;   r>   r!   )r@   r<   _further_process_kwargs)rA   r>   r9   rB   s      r.   rF   +Glm4vVideoProcessor._further_process_kwargsX   s>     !<VZ@ZYZZw.CDCFCCr-   metadatafpsc                     Ub  [        USS5      c  [        S5      eUR                  nUb  UOU R                  nUS-
  nUR                  =(       d    [        XaR                  -  5      S-   nXpR                  ::  as  [        [        R                  " Xu-  5      5      n[        U5       V	s/ s H;  n	[        U[        [        R                  " XR                  -  U-  5      5      5      PM=     n
n	O[        U R                  U-  5      nX:  a  [        [        U5      5      n
O[[        R                  " SX{SS9nU Vs/ s H8  n[        U[        [        R                  " XR                  -  5      5      5      PM:     n
n[!        5       / pU
 H-  nUU;  d  M  UR#                  U5        UR%                  U5        M/     ['        U5      S-  (       a  UR%                  US   5        [        R(                  " U5      $ s  sn	f s  snf )	a?  
Args:
    metadata (`VideoMetadata`):
        Metadata of the video containing information about total duration, fps and total number of frames.
    fps (`int` or `float`, *optional*):
        Target frames to sample per second. Defaults to `self.fps`.
Returns:
    np.ndarray:
        Indices to sample video frames.
NrI   zAsked to sample frames per second but no video metadata was provided which is required when sampling in GLM4V. Please pass in `VideoMetadata` object or set `do_sample_frames=False`r   r   T)endpoint)getattrr@   total_num_framesrI   durationroundmax_durationr(   mathfloorrangeminceilr*   nplinspacesetaddappendlenarray)rA   rH   rI   r9   total_framesrequested_fpsmax_frame_idxrO   niframe_indicesnum_samplestarget_secondstseenuniqidxs                    r.   sample_frames!Glm4vVideoProcessor.sample_framesf   s     wx=EX 
  00"DHH$q($$Omll.J(Ka(O(((DJJx789AkpqrkstksfgSDIIa,,>NQ^>^4_0`aksMtMd//-?@K* $U<%8 9!#QPT!U_m n_mZ[]C		!llBR8S4T!U_m nUBd C$C  !
 t9q=KKR!xx~% u !os   )AG6:?G;gp?videosdo_convert_rgb	do_resizeinterpolation
do_rescalerescale_factordo_normalizer   r    r   r   r   return_tensorsc                    [        U5      u  nn0 nUR                  5        H  u  nnUR                  u  nnnnnUUUnnnU(       am  [        UUUUX-  UR                  UR
                  S9u  nnUR                  UU-  UUU5      nU R                  U[        UUS9US9nUR                  UUUUU5      nUUU'   M     [        UU5      n[        U5      u  nn0 n 0 n!UR                  5        GH  u  nn[        US   [        R                  S9u  nnU R                  UXgXU
5      nUn"U"R                  S   U-  S:w  a8  U"S S 2SS 24   R                  SUS-
  SSS5      n#[        R                   " U"U#/SS9n"U"R                  S S	 u  n$n%n&U%U-  n%UU-  UU-  n(n'U"R                  U$U%UU&U'U-  UUU(U-  UU5
      n"U"R#                  SSS
SSSS	SSS5
      n"U"R%                  U$U%U'-  U(-  U&U-  U-  U-  5      n)U)U U'   U%U'U(//U$-  U!U'   GM     [        U U5      n*[        U!U5      n![        R                   " U*SS9n+[        R&                  " U!5      n,U+U,S.n-[)        U-US9$ )N)
num_framesheightwidthtemporal_factorfactor
min_pixels
max_pixels)rv   rw   )r>   ro   r   )channel_dimr   rL   )dimr               r5      	   )r7   r8   )datatensor_type)r   itemsshaper   r2   r3   viewresizer   r   r   r	   FIRSTrescale_and_normalizerepeattorchcatpermutereshapetensorr   ).rA   rl   rm   rn   r>   ro   rp   rq   rr   r   r    r   r   r   rs   r9   grouped_videosgrouped_videos_indexresized_videos_groupedr   stacked_videosBTCHWru   rv   rw   resized_heightresized_widthresized_videosprocessed_videos_groupedprocessed_gridspatchesrepeats
batch_sizegrid_tchannelgrid_hgrid_wflatten_patchesprocessed_videosr7   r8   r   s.                                                 r.   _preprocessGlm4vVideoProcessor._preprocess   s   $ 0EV/L,,!#%3%9%9%;!E>*00MAq!Q()1aJ0<)!$7%2#11#001- "0!4!4QUAq!!D!%"!}M"/ "- "
 "0!4!4Q1nm!\,:"5)) &<* ((>@TU 0E^/T,,#% %3%9%9%;!E>,:>!;LZjZpZp,q)NM "77
LV_N %G }}Q"55:!!RS&/004G!4KQPQSTU))Wg$6A>*1--*;'J22F+z9=J;VFFll#*$*$G ooaAq!Q1aCG%oo&(--
:ZGO />$U+'-vv&>%?*%LOE"K &<N **BDXY(:NO#ii(8a@o6#6,

 >BBr-   r!   )N)/r"   r#   r$   r%   r
   BICUBICresampler>   r   r   r   r   r    rn   rp   rr   rm   do_sample_framesr   r   rQ   r   r   valid_kwargsru   rI   model_input_namesr   r=   r   r   r&   rF   r   r   r(   r+   rj   r*   r   Tensorboolr'   r   r   r,   __classcell__)rB   s   @r.   r0   r0   0   s    "))H&8KLD$&9:N!JIIJLNJLJ0LJ
C.0@A[(E!F [ $(Dx D 
	D D" ,000 eCJ'(0j  $#',>,F,F )!:>9=$(-1$(;?aCU\\"aC aC 	aC
 x aC *aC aC aC aC U5$u+#567aC E%e"456aC SMaC &c]aC SMaC !sJ!78aC aCr-   r0   )#__doc__rR   typingr   r   numpyrW   r   image_processing_utilsr   image_utilsr   r   r	   r
   r   r   processing_utilsr   r   utilsr   r   video_processing_utilsr   r   video_utilsr   r   r   image_processing_glm4vr   r   r0   __all__r!   r-   r.   <module>r      s    *  "   2  5 5 X O O 0,L , l"}C, }C}C@ !
!r-   