
    cCi                         S SK JrJr  S SKrSSKJr  SSKJrJ	r	  SSK
JrJrJrJrJr  SSKJrJr   " S S	\5      r " S
 S\SS9r " S S\5      rS/rg)    )OptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)AudioKwargsImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   &    \ rS rSr% \\   \S'   Srg)Gemma3nImagesKwargs   do_convert_rgb N)__name__
__module____qualname____firstlineno__r   bool__annotations____static_attributes__r       h/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/gemma3n/processing_gemma3n.pyr   r      s    TN"r   r   c                   6    \ rS rSr% \\S'   \\S'   SSS00rSrg)	Gemma3nProcessorKwargs   audio_kwargsimages_kwargstext_kwargspaddingFr   N)	r   r   r   r   r	   r   r   	_defaultsr   r   r   r   r   r      s"    &&u
Ir   r   F)totalc                      ^  \ rS rSrSr/ SQrSrSrSr   SS\	S\	4U 4S	 jjjr
    SS
\\   S\\\\\   \\   4   S\\\R$                  \\   \\R$                     \\\      4      S\\   S\4
S jjrSrU =r$ )Gemma3nProcessor(   a,  
A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer
into a single processor.

Args:
    feature_extractor (`Gemma3nAudioFeatureExtractor`):
        Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This
        should return a `BatchFeature` with `input_features` and `input_features_mask` features.
    image_processor (`SiglipImageProcessorFast`):
        Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature`
        with a `pixel_values` feature.
    tokenizer (`GemmaTokenizerFast`):
        The text tokenizer for the model.
    chat_template (`string`, *optional*):
        A Jinja template for generating text prompts from a set of messages.
    audio_seq_length (int, *optional*, defaults to 188):
        The number of audio soft tokens that will be added to the text prompt
    image_seq_length (int, *optional*, defaults to 256):
        The number of image soft tokens that should be added to
)feature_extractorimage_processor	tokenizerAutoFeatureExtractorAutoImageProcessorAutoTokenizeraudio_seq_lengthimage_seq_lengthc                   > XPl         UR                  U l        UR                  U l        UR                  U l        SR	                  UR                  /U-  5      nSUR                   U UR
                   S3U l        X`l        UR                  U l        UR                  U l	        UR                  U l
        SR	                  UR                  /U-  5      n	SUR                   U	 UR                   S3U l        [        T
U ]8  " SUUUUS.UD6  g )N z

)r*   r+   r,   chat_templater   )r0   audio_token_id	boa_tokenaudio_tokenjoin	eoa_tokenfull_audio_sequencer1   image_token_id	boi_tokenimage_token	eoi_tokenfull_image_sequencesuper__init__)selfr*   r+   r,   r4   r0   r1   kwargsaudio_tokens_expandedimage_tokens_expanded	__class__s             r   rA   Gemma3nProcessor.__init__C   s    !1'66",,$00 ")>)>(?BR(R S%))*=*=)>?T>UV_ViViUjjn#o  0'66",,$00 ")>)>(?BR(R S%))*=*=)>?T>UV_ViViUjjn#o  	
/+'		

 	
r   imagestextaudiorC   returnc           	         Uc  Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6n[        U[        5      (       a  U/nO8[        U[        5      (       d#  [        US   [        5      (       d  [        S5      eUbn  U R                  " U40 US   D6nU(       d  U Vs/ s H  oR                  PM     nnU V	s/ s H(  oR                  U R                  U R                  5      PM*     nn	O0 nUb  U R                  R                  U5      n[        U5      n
U R                  " U
40 US   D6nU(       d8  U
 Vs/ s H+  nSR                  U R                   /[#        U5      -  5      PM-     nn[#        U
5      [#        U5      :w  a$  [        S[#        U
5       S	[#        U5       S
35      eU V	s/ s H(  oR                  U R                   U R$                  5      PM*     nn	O0 nUS   R'                  SS 5      nU R                  " SSU0US   DSS0D6nU R)                  X-S/S9  US   n[*        R,                  " U5      nSXU R.                  :H  '   SXU R0                  :H  '   UR3                  5        VVs0 s H  u  nnUUR5                  5       _M     nnnUR5                  5       US'   [7        0 UEUEUEUS9$ s  snf s  sn	f s  snf s  sn	f s  snnf )Nz5Provide at least one of `text`, `images`, or `audio`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr!   r"    z1Received inconsistently sized batches of images (z) and text (z).r#   return_tensorsrI   npimage)
modalities	input_ids   r   token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr   r,   init_kwargs
isinstancestrlistr*   r7   replacer:   r+   fetch_imagesr   r8   r=   lenr?   pop_check_special_mm_tokensrP   
zeros_liker;   r5   itemstolistr   )rB   rH   rI   rJ   videosrC   output_kwargsaudio_inputs_promptbatched_imagesimage_inputsrO   text_inputs	array_idsrU   kvs                     r   __call__Gemma3nProcessor.__call__c   s    <FNu}TUU**"
"&.."<"<
 
 dC  6DD$''
47C0H0H`aa11%Y=;XYL278%Q((%8 ^bb]aSYNN4#3#3T5M5MN]aDbDL))66v>F7?N//a-P_B`aL Q_`Q_v$"2"2!3c&k!ABQ_`>"c$i/ GNH[G\\hilmqirhssuv 
 ^bb]aSYNN4#3#3T5M5MN]aDbDL&}599:JDQnnd$d-2Nd_cd%%dWI%N  ,	y1;<D$7$778;<D$7$7781<1B1B1DE1DAq!((*}1DE(6(=(=(?$%!PK!P<!P<!P^lmmK 9 c a c Fs   7K/K2K/K	K!)
r0   r7   r5   r6   r<   r:   r?   r1   r=   r;   )N      )NNNN)r   r   r   r   __doc__
attributesfeature_extractor_classimage_processor_classtokenizer_classintrA   r   r   r   r   r   r]   rP   ndarrayfloatr   r   r   rq   r   __classcell__)rF   s   @r   r(   r(   (   s    * GJ40%O  # #
 
 
 
D (,^b_c?n$?n I0$y/4HYCZZ[?n bjj$u+tBJJ7GdSXkIZZ[\	?n /0?n 
?n ?nr   r(   )typingr   r   numpyrP   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   r   tokenization_utils_baser   r   r   r   r(   __all__r   r   r   <module>r      sR     #  4 A c c C#, #-U zn~ znz 
r   