
    +h                         S SK Jr  S SKJr  S SKJr  S SKrS SKJr  S SK	rSSK
JrJr  SSKJr  SSKJrJr  \R$                  " \5      r " S	 S
\R*                  5      r\ " S S\5      5       r " S S\R*                  5      r " S S\\5      rg)    )	dataclass)pi)OptionalN   )ConfigMixinregister_to_config)
ModelMixin)
BaseOutputloggingc                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	StableAudioPositionalEmbedding   zUsed for continuous timedimc                    > [         TU ]  5         US-  S:X  d   eUS-  n[        R                  " [        R
                  " U5      5      U l        g )N   r   )super__init__nn	Parametertorchrandnweights)selfr   half_dim	__class__s      p/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/stable_audio/modeling_stable_audio.pyr   'StableAudioPositionalEmbedding.__init__"   s@    aA~~!8||EKK$9:    timesreturnc                     US   nXR                   S    -  S-  [        -  n[        R                  " UR	                  5       UR                  5       4SS9n[        R                  " X4SS9nU$ )N).Nr   )r   )r   r   r   catsincos)r   r   freqs	fouriereds       r   forward&StableAudioPositionalEmbedding.forward(   sa    i T**Q.3IIuyy{EIIK8bA	IIu0b9	r   )r   )__name__
__module____qualname____firstlineno____doc__intr   r   Tensorr(   __static_attributes____classcell__r   s   @r   r   r      s2    ";C ;U\\ ell  r   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Srg) StableAudioProjectionModelOutput0   a  
Args:
Class for StableAudio projection layer's outputs.
    text_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states obtained by linearly projecting the hidden-states for the text encoder.
    seconds_start_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
        Sequence of hidden-states obtained by linearly projecting the audio start hidden states.
    seconds_end_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
        Sequence of hidden-states obtained by linearly projecting the audio end hidden states.
Ntext_hidden_statesseconds_start_hidden_statesseconds_end_hidden_states )r*   r+   r,   r-   r.   r7   r   r   r0   __annotations__r8   r9   r1   r:   r   r   r5   r5   0   sE    	 26.5:>%,,!7>8<x5<r   r5   c                   b   ^  \ rS rSrSr SS\\   4U 4S jjjrS\R                  4S jr
SrU =r$ )	StableAudioNumberConditionerB   a  
A simple linear projection model to map numbers to a latent space.

Args:
    number_embedding_dim (`int`):
        Dimensionality of the number embeddings.
    min_value (`int`):
        The minimum value of the seconds number conditioning modules.
    max_value (`int`):
        The maximum value of the seconds number conditioning modules
    internal_dim (`int`):
        Dimensionality of the intermediate number hidden states.
internal_dimc                    > [         TU ]  5         [        R                  " [	        U5      [        R
                  " US-   US95      U l        Xl        X l        X0l	        g )N   )in_featuresout_features)
r   r   r   
Sequentialr   Lineartime_positional_embeddingnumber_embedding_dim	min_value	max_value)r   rG   rH   rI   r?   r   s        r   r   %StableAudioNumberConditioner.__init__Q   sO     	)+*<8II,"2AUV*
&
 %9!""r   floatsc                 z   UR                  U R                  U R                  5      nXR                  -
  U R                  U R                  -
  -  n[        U R                  R                  5       5      R                  nUR                  U5      nU R	                  U5      nUR                  SSU R                  5      nU$ )Nr"   rA   )
clamprH   rI   nextrF   
parametersdtypetoviewrG   )r   rK   normalized_floatsembedder_dtype	embeddingfloat_embedss         r   r(   $StableAudioNumberConditioner.forwardb   s     dnndnn=#nn4$..9XY d<<GGIJPP-00@223DE	 ~~b!T-F-FGr   )rI   rH   rG   rF   )   )r*   r+   r,   r-   r.   r   r/   r   r   r0   r(   r1   r2   r3   s   @r   r=   r=   B   s;    & '*#
 sm# #" r   r=   c                      ^  \ rS rSrSr\U 4S j5       r   S	S\\R                     S\\R                     S\\R                     4S jjr
SrU =r$ )
StableAudioProjectionModelt   a  
A simple linear projection model to map the conditioning values to a shared latent space.

Args:
    text_encoder_dim (`int`):
        Dimensionality of the text embeddings from the text encoder (T5).
    conditioning_dim (`int`):
        Dimensionality of the output conditioning tensors.
    min_value (`int`):
        The minimum value of the seconds number conditioning modules.
    max_value (`int`):
        The maximum value of the seconds number conditioning modules
c                    > [         TU ]  5         X!:X  a  [        R                  " 5       O[        R                  " X5      U l        [        X#U5      U l        [        X#U5      U l        g )N)	r   r   r   IdentityrE   text_projectionr=   start_number_conditionerend_number_conditioner)r   text_encoder_dimconditioning_dimrH   rI   r   s        r   r   #StableAudioProjectionModel.__init__   sU    -ABKKMryyQaGt 	 )EEUbk(l%&BCS`i&j#r   r7   start_secondsend_secondsc                     Uc  UOU R                  U5      nUc  UOU R                  U5      nUc  UOU R                  U5      n[        UUUS9$ )N)r7   r8   r9   )r^   r_   r`   r5   )r   r7   rd   re   r8   r9   s         r   r(   "StableAudioProjectionModel.forward   sn     #5"<$BVBVWiBj 	 +2M8U8UVc8d 	$ 4?3FKDLgLghsLt!/1(C&?
 	
r   )r`   r_   r^   )NNN)r*   r+   r,   r-   r.   r   r   r   r   r0   r(   r1   r2   r3   s   @r   rZ   rZ   t   sj     k k 6:04.2	
$U\\2
  -
 ell+	
 
r   rZ   )dataclassesr   mathr   typingr   r   torch.nnr   torch.utils.checkpointconfiguration_utilsr   r   models.modeling_utilsr	   utilsr
   r   
get_loggerr*   loggerModuler   r5   r=   rZ   r:   r   r   <module>rs      s~    "      B / ( 
		H	%RYY " =z = ="/299 /d*
[ *
r   