
    cCi                      p    S r SSKrSSKJrJr  SSKJr  SSKJrJ	r	J
r
JrJr  SSKJr   " S S	\5      rS	/rg)
z!
Processor class for LayoutLMv3.
    N)OptionalUnion   )ProcessorMixin)BatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypec            (         ^  \ rS rSrSrSS/rSrSrS!U 4S jjr                  S"S\	\
\\\
   \\   4   S	\\	\\\   4      S
\\	\\\      \\\\         4      S\\	\\   \\\      4      S\S\	\\\4   S\	\\\4   S\\   S\S\\   S\\   S\\   S\S\S\S\S\S\\	\\4      S\4&S jjrS r\S 5       r\S 5       r\S 5       rS rU =r$ )#LayoutLMv3Processor   a>  
Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
single processor.

[`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.

It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
[`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).

Args:
    image_processor (`LayoutLMv3ImageProcessor`, *optional*):
        An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
    tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*):
        An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
image_processor	tokenizerLayoutLMv3ImageProcessor)LayoutLMv3TokenizerLayoutLMv3TokenizerFastc                    > S nSU;   a,  [         R                  " S[        5        UR                  S5      nUb  UOUn[        TU ]  X5        g )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.)warningswarnFutureWarningpopsuper__init__)selfr   r   kwargsr   	__class__s        n/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.pyr   LayoutLMv3Processor.__init__3   sQ     &(MM
 !'

+> ?-<-H/N_4    text	text_pairboxesword_labelsadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                 `   U R                   R                  (       a  Ub  [        S5      eU R                   R                  (       a  Ub  [        S5      eU R                  UUS9nUb;  U R                   R                  (       a   Uc  [        U[        5      (       a  U/nUS   nU R
                  " S0 SUb  UOUS   _SUb  UOS_SUb  UOUS   _S	U_S
U_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_UD6nUR                  S5      nUSL a  U R                  UUS   5      nUUS'   U$ )a  
This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case
[`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with
`apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along
with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
resized and normalized `pixel_values`.

Please refer to the docstring of the above two methods for more information.
NzdYou cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True.zaYou cannot provide word labels if you initialized the image processor with apply_ocr set to True.)imagesr4   wordsr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   pixel_valuesToverflow_to_sample_mapping )r   	apply_ocr
ValueError
isinstancestrr   r   get_overflowing_images)r   r7   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r   featuresencoded_inputss                          r    __call__LayoutLMv3Processor.__call__A   s   F ))u/@v  )){/Fs 
 ''vn'U  4 4 > >9CT$$$v )I 
)x/@
#,#8id
 !,%(72C
 $	

  2
 
 "
 "
 
  2
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ *'
. n-$,00Hd9efF)/~&r"   c                     / nU H  nUR                  X   5        M     [        U5      [        U5      :w  a#  [        S[        U5       S[        U5       35      eU$ )Nz`Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got z and )appendlenr=   )r   r7   r:   images_with_overflow
sample_idxs        r    r@   *LayoutLMv3Processor.get_overflowing_images   sr    !4J ''(:; 5 #$,F(GG,-.eC8R4S3TV 
 $#r"   c                 
    / SQ$ )N)	input_idsbboxattention_maskr9   r;   r   s    r    model_input_names%LayoutLMv3Processor.model_input_names   s    FFr"   c                 P    [         R                  " S[        5        U R                  $ )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r   r   r   image_processor_classrO   s    r    feature_extractor_class+LayoutLMv3Processor.feature_extractor_class   s"    u	
 )))r"   c                 P    [         R                  " S[        5        U R                  $ )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r   r   r   r   rO   s    r    r   %LayoutLMv3Processor.feature_extractor   s"    i	
 ###r"   r;   )NN)NNNNTFNNr   NNNFFFFTN)__name__
__module____qualname____firstlineno____doc__
attributesrS   tokenizer_classr   r   r
   r	   listr   intboolr?   r   r   r   r   rC   r@   propertyrP   rT   r   __static_attributes____classcell__)r   s   @r    r   r      s   & $[1J6HO5" _cQUIMCG#'5:;?$(,00404*/+0',#;?)R I0$y/4HYCZZ[R E"3T:K5L"LMN	R
 d49otDcO/DDEFR eDItDI$>?@R !R tS/12R $%778R SMR R %SMR  (~R  (~R $(R  %)!R" !%#R$ %R& 'R( !sJ!78)R, 
-Rh$ G G * * $ $r"   r   )r\   r   typingr   r   processing_utilsr   tokenization_utils_baser   r   r	   r
   r   utilsr   r   __all__r;   r"   r    <module>rj      s7     " . w w Z$. Z$z !
!r"   