
    cCi?!                     p    S r SSKrSSKJrJr  SSKJr  SSKJrJ	r	J
r
JrJr  SSKJr   " S S	\5      rS	/rg)
z!
Processor class for LayoutLMv2.
    N)OptionalUnion   )ProcessorMixin)BatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypec            (         ^  \ rS rSrSrSS/rSrSrS!U 4S jjr                  S"S\	\
\\\
   \\   4   S	\\	\\\   4      S
\\	\\\      \\\\         4      S\\	\\   \\\      4      S\S\	\\\4   S\	\\\4   S\\   S\S\\   S\\   S\\   S\S\S\S\S\S\\	\\4      S\4&S jjrS r\S 5       r\S 5       r\S 5       rS rU =r$ )#LayoutLMv2Processor   a@  
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 image processor and a LayoutLMv2 tokenizer into a
single processor.

[`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.

It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
[`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).

Args:
    image_processor (`LayoutLMv2ImageProcessor`, *optional*):
        An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
    tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`, *optional*):
        An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
image_processor	tokenizerLayoutLMv2ImageProcessor)LayoutLMv2TokenizerLayoutLMv2TokenizerFastc                    > S nSU;   a,  [         R                  " S[        5        UR                  S5      nUb  UOUn[        TU ]  X5        g )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.)warningswarnFutureWarningpopsuper__init__)selfr   r   kwargsr   	__class__s        n/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/layoutlmv2/processing_layoutlmv2.pyr   LayoutLMv2Processor.__init__3   sQ     &(MM
 !'

+> ?-<-H/N_4    text	text_pairboxesword_labelsadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                    U R                   R                  (       a  Ub  [        S5      eU R                   R                  (       a  Ub  [        S5      eUSL a  USL a  [        S5      eU R                  UUS9nUb;  U R                   R                  (       a   Uc  [        U[        5      (       a  U/nUS   nU R
                  " S0 S	Ub  UOUS   _S
Ub  UOS_SUb  UOUS   _SU_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_UD6nUR                  S5      nUSL a  U R                  UUS   5      nUUS'   U$ )a  
This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
[`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output,
together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
`False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional
arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images``.

Please refer to the docstring of the above two methods for more information.
NzdYou cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True.zaYou cannot provide word labels if you initialized the image processor with apply_ocr set to True.TFzKYou cannot return overflowing tokens without returning the offsets mapping.)imagesr4   wordsr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   pixel_valuesoverflow_to_sample_mappingimage )r   	apply_ocr
ValueError
isinstancestrr   r   get_overflowing_images)r   r7   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r   featuresencoded_inputss                          r    __call__LayoutLMv2Processor.__call__A   s   D ))u/@v  )){/Fs  %,1G51Pjkk ''vn'U  4 4 > >9CT$$$v )I 
)x/@
#,#8id
 !,%(72C
 $	

  2
 
 "
 "
 
  2
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ *'
. n-$,00Hd9efF"(wr"   c                     / nU H  nUR                  X   5        M     [        U5      [        U5      :w  a#  [        S[        U5       S[        U5       35      eU$ )Nz`Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got z and )appendlenr>   )r   r7   r:   images_with_overflow
sample_idxs        r    rA   *LayoutLMv2Processor.get_overflowing_images   sr    !4J ''(:; 5 #$,F(GG,-.eC8R4S3TV 
 $#r"   c                 
    / SQ$ )N)	input_idsbboxtoken_type_idsattention_maskr;   r<   r   s    r    model_input_names%LayoutLMv2Processor.model_input_names   s    QQr"   c                 P    [         R                  " S[        5        U R                  $ )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r   r   r   image_processor_classrQ   s    r    feature_extractor_class+LayoutLMv2Processor.feature_extractor_class   s"    u	
 )))r"   c                 P    [         R                  " S[        5        U R                  $ )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r   r   r   r   rQ   s    r    r   %LayoutLMv2Processor.feature_extractor   s"    i	
 ###r"   r<   )NN)NNNNTFFNr   NNNFFFFTN)__name__
__module____qualname____firstlineno____doc__
attributesrU   tokenizer_classr   r   r
   r	   listr   intboolr@   r   r   r   r   rD   rA   propertyrR   rV   r   __static_attributes____classcell__)r   s   @r    r   r      s   & $[1J6HO5" _cQUIMCG#'5:;@$(,00404*/+0',#;?)T I0$y/4HYCZZ[T E"3T:K5L"LMN	T
 d49otDcO/DDEFT eDItDI$>?@T !T tS/12T $%778T SMT T %SMT  (~T  (~T $(T  %)!T" !%#T$ %T& 'T( !sJ!78)T, 
-Tl$ R R * * $ $r"   r   )r^   r   typingr   r   processing_utilsr   tokenization_utils_baser   r   r	   r
   r   utilsr   r   __all__r<   r"   r    <module>rl      s7     " . w w \$. \$~ !
!r"   