
    +ho7                         S SK r S SKJrJr  S SKrS SKrS SKJr  SSK	J
r
  \
" 5       (       a  S SKJr  S r " S S	5      r " S
 S5      rg)    N)DictList)Image   )is_torchvision_available)
transformsc                   ^ [        U R                  6 SU-  :  aS  U R                  [        S U R                   5       5      [        R
                  S9n [        U R                  6 SU-  :  a  MS  [        U R                  6 U:  aP  U[        U R                  6 -  mU R                  [        U4S jU R                   5       5      [        R                  S9n [        U R                  6 S:  aP  S[        U R                  6 -  mU R                  [        U4S jU R                   5       5      [        R                  S9n [        R                  " U 5      nUR                  S   S-  S-  nUR                  S   S-  U-
  nUR                  S   S-  S-  nUR                  S   S-  U-
  nX#UR                  S   U-
  2XRR                  S   U-
  24   n[        R                  " U5      $ )	z
Crop the image so that its height and width does not exceed `max_image_size`, while ensuring both the height and
width are multiples of 16.
   c              3   *   #    U  H	  oS -  v   M     g7f)r
   N ).0xs     g/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/omnigen/processor_omnigen.py	<genexpr>crop_image.<locals>.<genexpr>#   s     *J>a6>s   )resamplec              3   @   >#    U  H  n[        UT-  5      v   M     g 7fNroundr   r   scales     r   r   r   '        *T^5U+;+;^      c              3   @   >#    U  H  n[        UT-  5      v   M     g 7fr   r   r   s     r   r   r   +   r   r   r      )minsizeresizetupler   BOXmaxBICUBICnparrayshape	fromarray)	pil_imagemax_image_sizearrcrop_y1crop_y2crop_x1crop_x2r   s          @r   
crop_imager0      s   
 y~~
!n"4
4$$U*J9>>*J%JUZU^U^$_	 y~~
!n"4
4 INNn,inn!55$$U*TY^^*T%T_d_l_l$m	
INNb S)..))$$U*TY^^*T%T_d_l_l$m	
((9
Cyy|b Q&GiilR')Gyy|b Q&GiilR')G
		!w..))A,:P0PP
QC??3    c                       \ rS rSrSS\4S jjrS rS rS rS r	        SS	\
\   S
\
\
\      S\S\S\S\S\S\S\S\4S jjrSrg)OmniGenMultiModalProcessor8   r*   c           
         ^ Xl         TU l        [        R                  " [        R                  " U4S j5      [        R
                  " 5       [        R                  " / SQ/ SQSS9/5      U l        [        5       U l	        g )Nc                    > [        U T5      $ r   r0   r)   r*   s    r   <lambda>5OmniGenMultiModalProcessor.__init__.<locals>.<lambda>?       Jy.4Yr1         ?r=   r=   Tmeanstdinplace)
text_tokenizerr*   r   ComposeLambdaToTensor	Normalizeimage_transformOmniGenCollatorcollator)selfrB   r*   s     `r   __init__#OmniGenMultiModalProcessor.__init__9   sc    ,,)11!!"YZ##%$$/X\] 
 ()r1   c           
         ^ TU l         [        R                  " [        R                  " U4S j5      [        R                  " 5       [        R
                  " / SQ/ SQSS9/5      U l        g )Nc                    > [        U T5      $ r   r7   r8   s    r   r9   AOmniGenMultiModalProcessor.reset_max_image_size.<locals>.<lambda>K   r;   r1   r<   Tr>   )r*   r   rC   rD   rE   rF   rG   )rJ   r*   s    `r   reset_max_image_size/OmniGenMultiModalProcessor.reset_max_image_sizeG   sQ    ,)11!!"YZ##%$$/X\] 
r1   c                     [        U[        5      (       a%  [        R                  " U5      R	                  S5      nU R                  U5      $ )NRGB)
isinstancestrr   openconvertrG   )rJ   images     r   process_image(OmniGenMultiModalProcessor.process_imageQ   s9    eS!!JJu%--e4E##E**r1   c           	         U R                  U5      nUb  [        U5      S:X  a!  U R                  U5      nUR                  S S S.$ Sn[        R
                  " XA5       Vs/ s H  oPR                  U5      R                  PM     nn[        S[        U5      5       H  nXg   S   S:X  d  M  Xg   SS  Xg'   M     [        R                  " XA5      nU V	s/ s H2  n	[        U	R                  S5      S   R                  S5      S   5      PM4     n
n	[        [        U
5      5      nU[        [        S[        U5      S-   5      5      :X  d
   SU 35       e[        U5      [        U5      :X  d    S	[        U5       S
[        U5       S35       eU
 Vs/ s H
  oUS-
     PM     nn/ n/ n[        [        U5      5       H  nUR                  Xg   5        U[        U5      S-
  :w  d  M*  [        U5      nX'   R                  S5      X'   R                  S5      -  S-  S-  nUR                  XU-   /5        UR                  S/U-  5        M     XUS.$ s  snf s  sn	f s  snf )Nr   )	input_idspixel_valuesimage_sizesz<\|image_\d+\|>r   |_zSimage_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be z?total images must be the same as the number of image tags, got z image tags and z imagesr   )add_prefix_instructionlenrB   r\   resplitrangefindallintsortedsetlistextendr   append)rJ   textinput_imagesmodel_inputspatternchunkprompt_chunksi
image_tagss	image_idsunique_image_idsr   all_input_idsimg_inx	start_inxr   s                    r   process_multi_modal_prompt5OmniGenMultiModalProcessor.process_multi_modal_promptV   sw   **403|#4#9..t4L!-!7!7^bcc$KM88T[KbcKb%,,U3==Kbcq#m,-A"a'#0#3AB#7  . ZZ.
BLM*QSa..s3B78*	M!#i.14a5E1F1J(K#LL 	
abrast	
L #$L(99 	
McRbNcMddtux  zF  vG  uH  HO  P	
9 6??YQU+Y?s=)*A  !12C&**.	#++B/,/2F2Fr2JJbPTVV	t+;<=$$aS4Z0 + +Y`aa= d N @s   $I9IIc                 .    SnSnSnSnU U U U U 3nU$ )Nz	<|user|>
z:Generate an image according to the following instructions
z<|assistant|>
<|diffusion|>z<|end|>
r   )rJ   promptuser_promptgeneration_promptassistant_promptprompt_suffixs         r   rc   1OmniGenMultiModalProcessor.add_prefix_instruction}   s;    "Y9#=!2 3F8M?K[J\]r1   Ninstructionsrp   heightwidthnegative_promptuse_img_cfgseparate_cfg_inputuse_input_image_size_as_outputnum_images_per_promptreturnc
                 D   [        U[        5      (       a  U/nU/n/ n
[        [        U5      5       GHP  nX   nUc  S OX+   nUb0  [        U5      S:  a!  U Vs/ s H  oR	                  U5      PM     nnO
S nSU;  d   eU R                  X5      nSu  nnU R                  US 5      nU(       aa  Ub\  [        U5      S:  aM  [        [        U5      5       Vs/ s H  nSUS-    S3PM     nnU R                  SR                  U5      U5      nOUn[        U	5       Hc  nU(       aC  U
R                  UUUUS   S   R                  S	5      US   S   R                  S
5      /45        MM  U
R                  UUUX4/45        Me     GMS     U R                  U
5      $ s  snf s  snf )Nr   z<img><|image_1|></img>)NNr   z<img><|image_z|></img> r]   rb   ra   )
rT   rU   rg   rd   rY   r}   joinrn   r   rI   )rJ   r   rp   r   r   r   r   r   r   r   
input_dataru   cur_instructioncur_input_imagesr   
mllm_inputneg_mllm_inputimg_cfg_mllm_inputimg_cfg_promptr`   s                       r   __call__#OmniGenMultiModalProcessor.__call__   s    lC(((>L(>L
s<()A*oO'3';t+4D0E0ICS#TCSa$6$6q$9CS #T #' /FFF88[J1;.N.!<<_dSN#/C8H4IQ4NOTUXYiUjOk%lOk!a!eWH&EOkN%l)-)H)HR`Iacs)t&)7&011%%&*.'7:??CZP^E_`aEbEgEghjEkl	 %%z>CUX^Wf&gh 2) *B }}Z((; $U &ms   FF)rI   rG   r*   rB   )   )Nr   r   aj  low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers.TFFr   )__name__
__module____qualname____firstlineno__ri   rK   rP   rY   r}   rc   r   rU   boolr   r   __static_attributes__r   r1   r   r3   r3   8   s    *s *
+
%bN )-  L #(/4%&2)3i2) 49o2) 	2)
 2) 2) 2) !2) )-2)  #2) 
2) 2)r1   r3   c                   B    \ rS rSrSS jrS rS rS rS rS r	S r
S	rg
)rH      c                     Xl         X l        g r   )pad_token_idhidden_size)rJ   r   r   s      r   rK   OmniGenCollator.__init__   s    (&r1   c                    / nUR                  S5      n[        U5      nU HL  n[        R                  " U5      nS/XG-
  -  [	        [        Xu-   S-   5      5      -   nUR                  U5        MN     [        R                  " U5      $ )Nra   r   r   )r   r#   torchsumrl   rg   rn   
LongTensor)	rJ   attention_masknum_tokens_for_output_imagesposition_idstext_length
img_lengthmasktemp_ltemp_positions	            r   create_positionOmniGenCollator.create_position   s    $))"-56
"DYYt_FC;#784f)A-.< M . # --r1   c                    / n/ nUR                  S5      n[        U5      nXV-   S-   nSnU GH  n	[        R                  " U	5      n
XZ-
  n[        R                  " [        R
                  " U
S-   U
S-   4S95      n[        R                  " U
S-   U4S9n[        R                  " X/SS9n[        R
                  " XjU-   S-   4S9n[        R                  " X/SS9nUS:  a]  [        R                  " U
S-   U-   U4S9n[        R                  " X/SS9n[        R
                  " X4S9n[        R                  " X/SS9nX(   nXo-
  nUS:  a.  SUSS2U* S24'   [        R                  " SUU R                  4S9nOSnUR                  UR                  S5      5        UR                  U5        US-  nGM     [        R                  " USS9U4$ )z
OmniGen applies causal attention to each element in the sequence, but applies bidirectional attention within
each image sequence References: [OmniGen](https://huggingface.co/papers/2409.11340)
ra   r   r   )r   )dimN)r   r#   r   r   triloneszeroscatr   rn   	unsqueeze)rJ   r   r   extended_maskpadding_imagesr   r   seq_leninxr   r   pad_l	temp_mask
image_maskpad_masktrue_img_lengthpad_img_lengthtemp_padding_imgss                     r   create_maskOmniGenCollator.create_mask   s   
 $))"-56
*Q."DYYt_F(E

5::FQJ
3K#LMI6A:z*BCJ		9"9rBI*z6IA6M)NOJ		9"9qAIqy ;;VaZ*-De,LM!IIx&;D	 ::E+;<!IIx&;C	:?O'9N!12	!n_--.$)KKaIYIY5Z$[!$(!  !4!4Q!78!!"341HC; #< yyA.>>r1   c                 d    UR                  5        H  nX#    H  u  pESX   XE2XE24'   M     M     U$ )Nr   )keys)rJ   r   r^   b_inxr|   end_inxs         r   !adjust_attention_for_input_images1OmniGenCollator.adjust_attention_for_input_images   sE     %%'E&1&8"	NO%i&79J&JK '9 ( r1   c           	      l   [        U Vs/ s H  n[        U5      PM     sn5      n/ n/ n[        [        U5      5       H  nX   n[        U5      n	XI-
  n
U
S:X  a'  UR                  S/U-  5        UR                  U5        O>UR                  S/U
-  S/U	-  -   5        UR                  U R                  /U
-  U-   5        Xr;   d  M  / nX'    H'  nUR                  U Vs/ s H  o3U
-   PM	     sn5        M)     XU'   M     [
        R                  " U5      [
        R                  " U5      U4$ s  snf s  snf )Nr   r   )r#   rd   rg   rn   r   r   r   )rJ   r\   r^   r   max_l
padded_idsr   ru   temp_idsr   r   new_inxold_inxs                r   pad_input_idsOmniGenCollator.pad_input_ids   s'   Y/YSVY/0
s9~&A |H]FNEz%%qcEk2!!(+%%qcEkQC&L&@A!!4#4#4"5"="HI*~GNNw#?w!Iw#?@  .!(A '" 
+U-=-=n-M{ZZ+ 0$ $@s   D,D1c                     / nU H#  nUR                  US   US   -  S-  S-  5        M%     / 0 peSnU HM  nUS   b?  UR                  US   5        US    H"  n	Xv;  a  U	/Xg'   M  Xg   R                  U	5        M$     US-  nMO     U Vs/ s H  oR                  S5      PM     nnU Vs/ s H  oS   PM	     n
nU R                  X5      u  pnU R	                  X5      nU R                  X5      u  pU R                  X5      nXXXV4$ s  snf s  snf )Nr   r   r   r]   r^   r\   )rn   rm   r   r   r   r   r   )rJ   mllm_inputstarget_img_sizer   img_sizer]   r^   r   r   r   r\   padded_input_idsr   r   r   s                  r   process_mllm_input"OmniGenCollator.process_mllm_input  s9   ')$'H(//hqk0IR0OSU0UV ( %'kA ,##An$56m,D/.2V*#*11$7	 -
 QJE  1==1A=-89[{^[	98<8J8J98b5+++NY)-)9)9.)g&??\~|hh >9s   D(Dc                 H   U Vs/ s H  o"S   PM	     nnU Vs/ s H  o"S   PM	     nnU Vs/ s H  o"S   PM	     nnU Vs/ s H  o"S   PM	     nnUS   b  X4-   U-   nXf-   U-   nOX4-   nXf-   nU R                  X65      u  nnn	n
nnUU	UUUS.nU$ s  snf s  snf s  snf s  snf )Nr   r   r
   r   )r\   r   r   input_pixel_valuesinput_image_sizes)r   )rJ   featuresfr   cfg_mllm_inputsr   r   all_padded_input_idsall_position_idsall_attention_maskall_padding_imagesall_pixel_valuesall_image_sizesdatas                 r   r   OmniGenCollator.__call__/  s    %-.XtX.)12AQ42,45HqdH5)12AQ42a ,%7:LLK-?/QO%7K-?O ##KA	
  .0,"2!0
 9 /252s   BBBB)r   r   N)r
   i   )r   r   r   r   rK   r   r   r   r   r   r   r   r   r1   r   rH   rH      s(    '
.)?V[0i4r1   rH   )re   typingr   r   numpyr%   r   PILr   utilsr   torchvisionr   r0   r3   rH   r   r1   r   <module>r      sE    
     - & 6) )DR Rr1   