
    cCiWM                        S r SSKrSSKrSSKJrJr  SSKrSSKJ	r	  SSK
JrJr  SSKJrJrJrJr  SSKJrJrJrJrJrJrJr  SS	KJrJrJrJr  SS
KJ r   \" 5       (       a  SSK!r!SSK"J#r#J$r$J%r%  \" 5       (       a  SSK&r&\RN                  " \(5      r)Sr*S r+         S S\,S\-S\,S\,S\-S\-S\-S\-S\\.   S\\,   S\#RF                  4S jjr/ S!S\R`                  S\,S\\\,\14      4S jjr2 " S S\5      r3S/r4g)"z%Image processor class for Pix2Struct.    N)OptionalUnion)hf_hub_download   )BaseImageProcessorBatchFeature)convert_to_rgb	normalizeto_channel_dimension_formatto_pil_image)ChannelDimension
ImageInputget_image_sizeinfer_channel_dimension_formatmake_flat_list_of_imagesto_numpy_arrayvalid_images)
TensorTypeis_torch_availableis_vision_availablelogging)requires_backends)Image	ImageDraw	ImageFontzybelkada/fontsc                    [        [        S/5        U R                  S5      n [        R                  R
                  R                  XU4X4S9nUR                  U R                  S5      U R                  S5      XS5      nUR                  SSSSS5      R                  U R                  S5      U-  U R                  S5      U-  U R                  S5      U-  U-  5      nUR                  S5      $ )	a  
Utility function to extract patches from a given image tensor. Returns a tensor of shape
(1, `rows`, `columns`, `num_channels`x `patch_height` x `patch_width`).

Args:
    image_tensor (torch.Tensor):
        The image tensor to extract patches from.
    patch_height (int):
        The height of the patches to extract.
    patch_width (int):
        The width of the patches to extract.
torchr   )stride         r   )
r   torch_extract_patches	unsqueezer   nn
functionalunfoldreshapesizepermute)image_tensorpatch_heightpatch_widthpatchess       t/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/pix2struct/image_processing_pix2struct.pyr#   r#   4   s     +gY7))!,Lhh!!((k7R\h[v(wGool//2L4E4Ea4H,eghGooaAq!,44!,!+!|+k9G
 Q    text	text_size
text_colorbackground_colorleft_paddingright_paddingtop_paddingbottom_padding
font_bytes	font_pathreturnc
                 J   [        [        S5        [        R                  " SS9n
U
R	                  U S9nSR                  U5      nUb  U	c  [        R                  " U5      nOU	b  U	nO[        [        S5      n[        R                  " USUS9n[        R                  " [        R                  " S	S
U5      5      nUR!                  SX5      u    nnnUU-   U-   nUU-   U-   n[        R                  " S	UU4U5      n[        R                  " U5      nUR#                  XF4XUS9  U$ )aC  
Render text. This script is entirely adapted from the original script that can be found here:
https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

Args:
    text (`str`, *optional*, defaults to ):
        Text to render.
    text_size (`int`, *optional*, defaults to 36):
        Size of the text.
    text_color (`str`, *optional*, defaults to `"black"`):
        Color of the text.
    background_color (`str`, *optional*, defaults to `"white"`):
        Color of the background.
    left_padding (`int`, *optional*, defaults to 5):
        Padding on the left.
    right_padding (`int`, *optional*, defaults to 5):
        Padding on the right.
    top_padding (`int`, *optional*, defaults to 5):
        Padding on the top.
    bottom_padding (`int`, *optional*, defaults to 5):
        Padding on the bottom.
    font_bytes (`bytes`, *optional*):
        Bytes of the font to use. If `None`, the default font will be used.
    font_path (`str`, *optional*):
        Path to the font to use. If `None`, the default font will be used.
visionP   )width)r1   
z	Arial.TTFzUTF-8)encodingr)   RGB)r   r   r   r   )xyr1   fillfont)r   render_texttextwrapTextWrapperwrapjoinioBytesIOr   DEFAULT_FONT_PATHr   truetyper   Drawr   newtextbboxr1   )r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   wrapperlineswrapped_textrF   	temp_draw_
text_widthtext_heightimage_widthimage_heightimagedraws                         r/   rG   rG   O   s   L k8, "",GLLdL#E99U#L)"3zz*%		0+>dW9ED uyy8HIJI$-$6$6v|$R!Aq*k |+m;K,~=LIIek<8:JKE>>% DII,,<W[I\Lr0   r\   headerinput_data_formatc                 h   [        [        S5        [        XS9n [        U40 UD6n[	        UR
                  U R
                  5      n[        U R                  XPR
                  -  -  5      n[        UR                  XTR
                  -  -  5      n[        R                  " SXVU-   4S5      nUR                  UR                  XW45      S5        UR                  U R                  XV45      SU45        [        U5      n[        U5      [        R                  :X  a  [!        U[        R                  5      nU$ )a  
Renders the input text as a header on the input image.

Args:
    image (`np.ndarray`):
        The image to render the header on.
    header (`str`):
        The header text.
    data_format (`Union[ChannelDimension, str]`, *optional*):
        The data format of the image. Can be either "ChannelDimension.channels_first" or
        "ChannelDimension.channels_last".

Returns:
    `np.ndarray`: The image with the header rendered.
r=   )r_   rB   whiterC   r   )r   render_headerr   rG   maxr?   intheightr   rQ   pasteresizer   r   r   LASTr   )	r\   r^   r_   kwargsheader_image	new_width
new_heightnew_header_height	new_images	            r/   rb   rb      s   $ mX. DEv00LL&&4IU\\Y%<=>JL//9?Q?Q3QRS		%):K-K!LgVIOOL''(FGPOOELL)!89A?P;QR y)I%i04D4I4II/	;K;P;PQ	r0   c                     ^  \ rS rSrSrSS/r     SS\S\S\\\	\
4      S	\
S
\SS4U 4S jjjr SS\R                  S	\
S\S\\\	\4      S\R                  4
S jjr  SS\R                  S\\\	\4      S\\\	\4      S\R                  4S jjrSSSSSS\R$                  S4S\S\\	   S\\   S\\   S	\\
   S\\\	\
4      S\\\	\4      S\S\\\	\4      S\4S jjrSrU =r$ )Pix2StructImageProcessor   a&  
Constructs a Pix2Struct image processor.

Args:
    do_convert_rgb (`bool`, *optional*, defaults to `True`):
        Whether to convert the image to RGB.
    do_normalize (`bool`, *optional*, defaults to `True`):
        Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
        method. According to Pix2Struct paper and code, the image is normalized with its own mean and standard
        deviation.
    patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
        The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
    max_patches (`int`, *optional*, defaults to 2048):
        The maximum number of patches to extract from the image as per the [Pix2Struct
        paper](https://huggingface.co/papers/2210.03347).
    is_vqa (`bool`, *optional*, defaults to `False`):
        Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
        rendered onto the input images.
flattened_patchesattention_maskNdo_convert_rgbdo_normalize
patch_sizemax_patchesis_vqar;   c                 t   > [         TU ]  " S0 UD6  Ub  UOSSS.U l        X l        Xl        X@l        XPl        g )N   )re   r?    )super__init__rv   ru   rt   rw   rx   )selfrt   ru   rv   rw   rx   ri   	__class__s          r/   r}   !Pix2StructImageProcessor.__init__   s@     	"6"(2(>*r\^D_(,&r0   r\   r_   c           	         [        U R                  S5        [        U[        R                  U5      n[
        R                  " U5      nUS   US   pv[        U[        R                  5      u  p[        R                  " X&U-  -  Xy-  -  5      n
[        [        [        R                  " X-  U-  5      U5      S5      n[        [        [        R                  " X-  U-  5      U5      S5      n[        X-  S5      n[        X-  S5      n[
        R                  R                  R                  UR!                  S5      X4SSSS	9R#                  S5      n[%        XU5      nUR&                  nUS   nUS
   nUS   nUR)                  UU-  U/5      n[
        R*                  " U5      R)                  US/5      R-                  SU5      R)                  UU-  S/5      n[
        R*                  " U5      R)                  SU/5      R-                  US5      R)                  UU-  S/5      nUS-  nUS-  nUR/                  [
        R0                  5      nUR/                  [
        R0                  5      n[
        R2                  " UUU/S5      n[
        R                  R                  R5                  USSSUUU-  -
  /5      R7                  5       n[9        U5      nU$ )ar  
Extract flattened patches from an image.

Args:
    image (`np.ndarray`):
        Image to extract flattened patches from.
    max_patches (`int`):
        Maximum number of patches to extract.
    patch_size (`dict`):
        Dictionary containing the patch height and width.

Returns:
    result (`np.ndarray`):
        A sequence of `max_patches` flattened patches.
r   re   r?   r   r   bilinearFT)r)   modealign_corners	antialiasr"   r   r    )r   extract_flattened_patchesr   r   FIRSTr   
from_numpyr   mathsqrtrc   minfloorr%   r&   interpolater$   squeezer#   shaper(   arangerepeattofloat32catpadfloatr   )r~   r\   rw   rv   r_   ri   r,   r-   r[   rZ   scalenum_feasible_rowsnum_feasible_colsresized_heightresized_widthr.   patches_shaperowscolumnsdepthrow_idscol_idsresults                          r/   r   2Pix2StructImageProcessor.extract_flattened_patches   s   . 	$88'B ,E3C3I3IK\]  '$.x$8*W:Mk$25:J:P:P$Q! 		+)DEIbcdDJJu/Cl/R$SU` acdeDJJu/B[/P$QS^ _abc.=qA-;Q?##//OOA 0 0 
 '!* 	 ([IQ"a  //4'>5"9: ,,t$,,dAY7>>q'JRRTX[bTbdeSfg,,w'//G=DDT1MUUW[^eWeghVij 	11 **U]]+**U]]+ GWg6; $$((!Q;$QX.;Y1Z[aac'r0   data_formatc           	      x   UR                   [        R                  :X  a  UR                  [        R                  5      n[        R
                  " U5      n[        R                  " U5      n[        US[        R                  " [        R                  " UR                  5      5      -  5      n[        U4UUUUS.UD6$ )a  
Normalize an image. image = (image - image_mean) / image_std.

The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

Args:
    image (`np.ndarray`):
        Image to normalize.
    data_format (`str` or `ChannelDimension`, *optional*):
        The channel dimension format for the output image. If unset, the channel dimension format of the input
        image is used.
    input_data_format (`str` or `ChannelDimension`, *optional*):
        The channel dimension format of the input image. If not provided, it will be inferred.
g      ?)meanstdr   r_   )dtypenpuint8astyper   r   r   rc   r   r   prodr   r
   )r~   r\   r   r_   ri   r   r   adjusted_stddevs           r/   r
   "Pix2StructImageProcessor.normalize5  s    , ;;"(("LL,E wwu~ffUmc32775;;3G)H#HI
#/
 
 	
r0   imagesheader_textreturn_tensorsc
                 4   Ub  UOU R                   nUb  UOU R                  nUb  UOU R                  nUb  UOU R                  nU R                  nU
R                  S5      b  [        S5      e[        U5      n[        U5      (       d  [        S5      eU(       a  U Vs/ s H  n[        U5      PM     nnU Vs/ s H  n[        U5      PM     nnU	c  [        US   5      n	U(       a  Uc  [        S5      eU
R                  SS5      nU
R                  SS5      n[        U[        5      (       a  U/[        U5      -  n[!        U5       VVs/ s H  u  p[#        XU   XS	9PM     nnnU(       a  U Vs/ s H  oR%                  XS
9PM     nnU Vs/ s H  nU R'                  XXiS9PM     nnU Vs/ s H1  oR)                  SS9S:g  R+                  [,        R.                  5      PM3     nn[1        UUS.US9nU$ s  snf s  snf s  snnf s  snf s  snf s  snf )as
  
Preprocess an image or batch of images. The processor first computes the maximum possible number of
aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
image with zeros to make the image respect the constraint of `max_patches`. Before extracting the patches the
images are standardized following the tensorflow implementation of `per_image_standardization`
(https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).


Args:
    images (`ImageInput`):
        Image to preprocess. Expects a single or batch of images.
    header_text (`Union[list[str], str]`, *optional*):
        Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
    do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
        Whether to convert the image to RGB.
    do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
        Whether to normalize the image.
    max_patches (`int`, *optional*, defaults to `self.max_patches`):
        Maximum number of patches to extract.
    patch_size (`dict`, *optional*, defaults to `self.patch_size`):
        Dictionary containing the patch height and width.
    return_tensors (`str` or `TensorType`, *optional*):
        The type of tensors to return. Can be one of:
            - Unset: Return a list of `np.ndarray`.
            - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
            - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
            - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
    data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
        The channel dimension format for the output image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - Unset: Use the channel dimension format of the input image.
    input_data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the input image. If unset, the channel dimension format is inferred
        from the input image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Nr   z8data_format is not an accepted input as the outputs are zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.r   z.A header text must be provided for VQA models.r9   r:   )r9   r:   )r\   r_   )r\   rw   rv   r_   r    )axis)rr   rs   )datatensor_type)ru   rt   rv   rw   rx   get
ValueErrorr   r   r	   r   r   pop
isinstancestrlen	enumeraterb   r
   r   sumr   r   r   r   )r~   r   r   rt   ru   rw   rv   r   r   r_   ri   rx   r\   r9   r:   iattention_masksencoded_outputss                     r/   
preprocess#Pix2StructImageProcessor.preprocess\  s@   j (4'?|TEVEV+9+E4K^K^#-#9Zt
%0%<k$BRBR::m$0WXX)&1F##:  9?@nU+F@ 6<<VE.'V<$ >vay I" !QRRL$7J

;5I+s++*mc&k9 !*& 1 1HA e^
` 1  
 djkdj[`nn5nVdjFk  	
   ** +   	 	 
 V\\U[EII2I.!3;;BJJGU[\&'-Q_m
 S A = l
 ]s$   G;:H H4HH28H)rt   ru   rx   rw   rv   )TTNi   FN)NN)__name__
__module____qualname____firstlineno____doc__model_input_namesboolr   dictr   rd   r}   r   ndarrayr   r   r   r
   r   r   r   r   __static_attributes____classcell__)r   s   @r/   rp   rp      s   ( -.>?  $!/3  T#s(^,	
   
 * EIOzzO O 	O
 $E#/?*?$@AO 
Oh ?CDH	%
zz%
 eC)9$9:;%
 $E#/?*?$@A	%
 
%
T &*)-'+%)/3;?(8(>(>DHqq c]q !	q
 tnq c]q T#s(^,q !sJ!78q &q $E#/?*?$@Aq 
q qr0   rp   )	$   blackra      r   r   r   NNr   )5r   rL   r   typingr   r   numpyr   huggingface_hubr   image_processing_utilsr   r   image_transformsr	   r
   r   r   image_utilsr   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   rH   PILr   r   r   r   
get_loggerr   loggerrN   r#   r   rd   bytesrG   r   ChildProcessErrorrb   rp   __all__r{   r0   r/   <module>r      sp   , 	  "  + F d d   R Q 3 //			H	%$  : #"&#@
@@ @ 	@
 @ @ @ @ @ }@ [[@J bf'::'"'7?cK\F\@]7^'TP1 Pf &
&r0   