
    +h7                     ~    S SK Jr  S SKrS SKrS SKJr  S SKJrJr  S SK	J
r
  SSKJrJr  SSKJr   " S	 S
\\\
5      rg)    )OptionalN)nn)
GPT2ConfigGPT2LMHeadModel)ModuleUtilsMixin   )ConfigMixinregister_to_config)
ModelMixinc            (       
  ^  \ rS rSrSrSS/r\                 S*S\S\S\\   S\S	\S
\S\S\S\\   S\	S\
S\
S\
S\
S\
S\S\S\S\4&U 4S jjj5       r  S+S\R                  S\R                  S\\R                     S\\R                     4S jjrS\S\R                   S \R                  4S! jrS" r\R&                  " 5       S# 5       r\R&                  " 5              S,S$\S%\S&\
S'\\   4S( jj5       rS)rU =r$ )-UniDiffuserTextDecoder   a  
Text decoder model for a image-text [UniDiffuser](https://huggingface.co/papers/2303.06555) model. This is used to
generate text from the UniDiffuser image-text embedding.

Parameters:
    prefix_length (`int`):
        Max number of prefix tokens that will be supplied to the model.
    prefix_inner_dim (`int`):
        The hidden size of the incoming prefix embeddings. For UniDiffuser, this would be the hidden dim of the
        CLIP text encoder.
    prefix_hidden_dim (`int`, *optional*):
        Hidden dim of the MLP if we encode the prefix.
    vocab_size (`int`, *optional*, defaults to 50257):
        Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
    n_positions (`int`, *optional*, defaults to 1024):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    n_embd (`int`, *optional*, defaults to 768):
        Dimensionality of the embeddings and hidden states.
    n_layer (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    n_head (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    n_inner (`int`, *optional*, defaults to None):
        Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
    activation_function (`str`, *optional*, defaults to `"gelu"`):
        Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
    resid_pdrop (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    embd_pdrop (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the embeddings.
    attn_pdrop (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the attention.
    layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
        The epsilon to use in the layer normalization layers.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    scale_attn_weights (`bool`, *optional*, defaults to `True`):
        Scale attention weights by dividing by sqrt(hidden_size)..
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
    scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
        Whether to additionally scale attention weights by `1 / layer_idx + 1`.
    reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
        Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
        dot-product/softmax to float() when training with mixed precision.
zh\.\d+\.attn\.biaszh\.\d+\.attn\.masked_biasprefix_lengthprefix_inner_dimprefix_hidden_dim
vocab_sizen_positionsn_embdn_layern_headn_inneractivation_functionresid_pdrop
embd_pdrop
attn_pdroplayer_norm_epsiloninitializer_rangescale_attn_weights	use_cachescale_attn_by_inverse_layer_idxreorder_and_upcast_attnc                 @  > [         TU ]  5         Xl        X&:w  a  Uc  [        SU SU S35      eX l        X0l        U R
                  b+  [        R                  " U R                  U R
                  5      O[        R                  " 5       U l	        U R
                  b!  [        R                  " U R
                  U5      O[        R                  " 5       U l
        [        S0 SU_SU_SU_SU_SU_S	U	_S
U
_SU_SU_SU_SU_SU_SU_SU_SU_SU_6n[        U5      U l        g )Nz>`prefix_hidden_dim` cannot be `None` when `prefix_inner_dim`: z and `n_embd`: z are not equal.r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!    )super__init__r   
ValueErrorr   r   r   LinearIdentityencode_prefixdecode_prefixr   r   transformer)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   
gpt_config	__class__s                        o/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/unidiffuser/modeling_text_decoder.pyr%   UniDiffuserTextDecoder.__init__B   s   . 	*%*;*CPQbPc d$X_6 
 !1!2 %%1 IId++T-C-CD 	 :>9O9O9[BIId,,f5acalalan 	   
!
#
 
 	

 
 
 !4
 $
 "
 "
  2
 0
  2
  
 -L
  %<!

$ +:6    	input_idsprefix_embedsattention_masklabelsc                    U R                   R                   R                  U5      nU R                  U5      nU R                  U5      n[        R
                  " X%4SS9nUb?  U R                  UR                  S   UR                  5      n[        R
                  " X4SS9nU R                  XtUS9n	U R                  b  X4$ U	$ )a  
Args:
    input_ids (`torch.Tensor` of shape `(N, max_seq_len)`):
        Text tokens to use for inference.
    prefix_embeds (`torch.Tensor` of shape `(N, prefix_length, 768)`):
        Prefix embedding to prepend to the embedded tokens.
    attention_mask (`torch.Tensor` of shape `(N, prefix_length + max_seq_len, 768)`, *optional*):
        Attention mask for the prefix embedding.
    labels (`torch.Tensor`, *optional*):
        Labels to use for language modeling.
   dimr   )inputs_embedsr5   r4   )
r+   wter)   r*   torchcatget_dummy_tokenshapedevicer   )
r,   r2   r3   r4   r5   embedding_texthiddenembedding_catdummy_tokenouts
             r/   forwardUniDiffuserTextDecoder.forward   s    $ ))5599)D##M2**62		="AqI..yq/A9CSCSTKYY7Q?F]Zhi!!-;Jr1   
batch_sizer@   returnc                 ^    [         R                  " XR                  [         R                  US9$ )N)dtyper@   )r<   zerosr   int64)r,   rH   r@   s      r/   r>   &UniDiffuserTextDecoder.get_dummy_token   s     {{:'9'9U[\\r1   c                 $    U R                  U5      $ )N)r)   )r,   prefixs     r/   encodeUniDiffuserTextDecoder.encode   s    !!&))r1   c                 Z   [         R                  " USSS9n/ n/ nU H]  nU R                  UR                  U5      5      nU R	                  XcUS9u  pxUR                  US   5        UR                  US   5        M_     [         R                  " U5      n[         R                  " U5      nXE4$ )a  
Generate captions given text embedding features. Returns list[L].

Args:
    features (`torch.Tensor` of shape `(B, L, D)`):
        Text embedding features to generate captions from.
    eos_token_id (`int`):
        The token ID of the EOS token for the text decoder model.
    device:
        Device to perform text generation on.

Returns:
    `List[str]`: A list of strings generated from the decoder model.
r7   r   r8   )input_embedsr@   eos_token_id)r<   splitr*   togenerate_beamappendstack)	r,   featuresrU   r@   generated_tokensgenerated_seq_lengthsfeatureoutput_tokensseq_lengthss	            r/   generate_captions(UniDiffuserTextDecoder.generate_captions   s    " ;;x2 "G((F);<G)-););$, *< *&M ##M!$45!((Q8   !;;'78 %,A B66r1   	beam_sizeentry_lengthtemperaturerU   c                    UnSn	Sn
[         R                  " XC[         R                  S9n[         R                  " XC[         R                  S9nUb  UnO%U R
                  R
                  R                  U5      n[        U5       GHb  nU R                  US9nUR                  nUSS2SSS24   US:  a  UOS-  nUR                  S5      R                  5       nU
c  UR                  US5      u  n
nUR                  " U/UR                  SS Q76 nUR                  SS5      U
R                  S5      n
nU	c  Un	GO
U	R                  " U/U	R                  SS Q76 n	[         R                   " U	U4SS9n	O[#        [$        R&                  5      * UU'   SUUS4'   U
SS2S4   U-   nX) ==   S-  ss'   UUSS2S4   -  nUR)                  S5      R                  US5      u  nnUUR                  S   -  nUU   nUUR                  S   -  nUR+                  S5      nU	U   n	[         R                   " U	U4SS9n	UU   nUU-  n
UU   nU R
                  R
                  R                  UR                  5       5      R)                  UR                  S   SS5      n[         R                   " UU4SS9nUUR-                  U5      R                  5       -   nUR/                  5       (       d  GMc    O   X-  n
U
R1                  S	S
9nU Vs/ s H  oU   PM	     nn[         R2                  " USS9n[         R4                  " U Vs/ s H  oU   PM	     snUR6                  S9nUU4$ s  snf s  snf )a  
Generates text using the given tokenizer and text prompt or token embedding via beam search. This
implementation is based on the beam search implementation from the [original UniDiffuser
code](https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py#L89).

Args:
    eos_token_id (`int`, *optional*):
        The token ID of the EOS token for the text decoder model.
    input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
        Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds`
        must be supplied.
    input_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
        An embedded representation to directly pass to the transformer as a prefix for beam search. One of
        `input_ids` and `input_embeds` must be supplied.
    device:
        The device to perform beam search on.
    beam_size (`int`, *optional*, defaults to `5`):
        The number of best states to store during beam search.
    entry_length (`int`, *optional*, defaults to `67`):
        The number of iterations to run beam search.
    temperature (`float`, *optional*, defaults to 1.0):
        The temperature to use when performing the softmax over logits from the decoding model.

Returns:
    `Tuple(torch.Tensor, torch.Tensor)`: A tuple of tensors where the first element is a tensor of generated
    token sequences sorted by score in descending order, and the second element is the sequence lengths
    corresponding to those sequences.
N)r@   rK   )r:   r         ?r7   r8   T)
descending)rK   )r<   onesintrL   boolr+   r;   rangelogitssoftmaxlogtopkexpandr?   permutesqueezer=   floatnpinfview	unsqueezeeqallargsortrZ   tensorrK   )r,   r2   rT   r@   rc   rd   re   rU   stop_token_indextokensscoresr`   
is_stopped	generatedioutputsrn   next_tokens
scores_sumscores_sum_averagenext_tokens_sourcenext_token_embedorderoutput_textss                           r/   rX   $UniDiffuserTextDecoder.generate_beam   s`   P (jjK[[L
#$I((4488CI|$A&&Y&?G^^FAr1H%aSQF^^B'++-F~&,kk)R&@#%,,YM9LM	&1&9&9!Q&?PQARV>(F#]]9Hv||AB7GHF"YY'<!DF&+BFFm^z"()z1}%#AtG_v5
K(A-(%/+ag2F%F"2D2I2I"2M2R2RS\^`2a/"K%0J4D4DQ4G%G")*<=)J,<,<Q,??)33A6 23FK#8a@%&89	+k9'(:;
#//;;??@S@S@UV[[\e\k\klm\npqsuv		9.>"?QGI#knn5E&F&N&N&PPJ~~I %L %$/+015aq	51{{<Q7llE#BEqNE#B+J[J[\[(( 2#Bs   M:M!)r*   r)   r   r   r   r+   )NiQ  i   i      r   Ngelu_new皙?r   r   gh㈵>g{Gz?TTFF)NN)NNN   C   rh   N)__name__
__module____qualname____firstlineno____doc__"_keys_to_ignore_on_load_unexpectedr
   rk   r   strru   rl   r%   r<   TensorrF   r@   r>   rQ   no_gradra   rX   __static_attributes____classcell__)r.   s   @r/   r   r      s   /b +@A])^&
 ,0!%#- $(#'#'05(-)>7>7 >7 $C=	>7
 >7 >7 >7 >7 >7 #>7 !>7 >7 >7 >7 ">7  !!>7" !#>7$ %>7& *.'>7( "&)>7 >7H 26)-<< || !.	
 &@]# ]u|| ] ]* ]]_7 7> ]]_  &*^)
 ^) ^) ^) sm^) ^)r1   r   )typingr   numpyrv   r<   r   transformersr   r   transformers.modeling_utilsr   configuration_utilsr	   r
   modelsr   r   r#   r1   r/   <module>r      s0        4 8 B  Z)Z6F Z)r1   