
    cCih                         S r SSKrSSKJr  SSKJr  SSKJr  \R                  " \	5      r
/ SQr/ SQrS	/r/ S
QrS rS rS rS r\\\\S.r " S S\5      r " S S\5      r " S S\5      r/ SQrg)zJukebox configuration    N)Union   )PretrainedConfig)logging)O
block_attntranspose_block_attnprev_block_attnr   r   r	   r   r   r	   r   r   r	   r   r   r	   r   r   r	   cross_attentionr   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   )r   r   r	   dense_attention)
prime_attnr   
dense_attnc                     [         S   $ )Nr   )_FullDenseAttentionlayers    v/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.pyfull_dense_attentionr   q   s    q!!    c                     [         U S-     $ )N   )_RawColumnPreviousRowAttentionr   s    r   !raw_column_previous_row_attentionr   u   s    )%!)44r   c                     [         U S-     $ )NO   )_LARGE_ATTENTIONr   s    r    large_separated_enc_dec_w_lyricsr   y   s    EBJ''r   c                 D    U S-  S:X  a  [         U S-     $ [        U S-     $ )N      r   )_PrimePrimeDenseAttentionr   r   s    r   enc_dec_with_lyricsr!   }   s,    rzR(33)%!)44r   )r   r   r   r!   c            *          ^  \ rS rSrSrSrSSS.rSSS	S
SSSSSSSSSSSSSSSSSSSSS/SSSSS	SSSSSS/ SQ/ S QSS!SSS4*U 4S" jjr\S&S#\	\
\R                  4   4S$ jj5       rS%rU =r$ )'JukeboxPriorConfig   a  
    This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
    `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the top level prior from the
    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
-1b-lyrics) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.



Args:
    act_fn (`str`, *optional*, defaults to `"quick_gelu"`):
        Activation function.
    alignment_head (`int`, *optional*, defaults to 2):
        Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio
        alignment
    alignment_layer (`int`, *optional*, defaults to 68):
        Index of the layer that is responsible of the alignment between lyrics and music. Only used to compute the
        lyric to audio alignment
    attention_multiplier (`float`, *optional*, defaults to 0.25):
        Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
        0.25*width of the model will be used.
    attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`):
        Which attention pattern to use for the decoder/
    attn_dropout (`int`, *optional*, defaults to 0):
        Dropout probability for the post-attention layer dropout in the decoder.
    attn_res_scale (`bool`, *optional*, defaults to `False`):
        Whether or not to scale the residuals in the attention conditioner block.
    blocks (`int`, *optional*, defaults to 64):
        Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len //
        blocks]` in the `JukeboxAttention` layer.
    conv_res_scale (`int`, *optional*):
        Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
        conditioner, the default value is to None and should not be modified.
    num_layers (`int`, *optional*, defaults to 72):
        Number of layers of the transformer architecture.
    emb_dropout (`int`, *optional*, defaults to 0):
        Embedding dropout used in the lyric decoder.
    encoder_config (`JukeboxPriorConfig`, *optional*) :
        Configuration of the encoder which models the prior on the lyrics.
    encoder_loss_fraction (`float`, *optional*, defaults to 0.4):
        Multiplication factor used in front of the lyric encoder loss.
    hidden_size (`int`, *optional*, defaults to 2048):
        Hidden dimension of the attention layers.
    init_scale (`float`, *optional*, defaults to 0.2):
        Initialization scales for the prior modules.
    is_encoder_decoder (`bool`, *optional*, defaults to `True`):
        Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
        greater than 0, the `encoder` args should be specified for the lyric encoding.
    mask (`bool`, *optional*, defaults to `False`):
        Whether or not to mask the previous positions in the attention.
    max_duration (`int`, *optional*, defaults to 600):
        Maximum supported duration of the generated song in seconds.
    max_nb_genres (`int`, *optional*, defaults to 1):
        Maximum number of genres that can be used to condition the model.
    merged_decoder (`bool`, *optional*, defaults to `True`):
        Whether or not the decoder and the encoder inputs are merged. This is used for the separated
        encoder-decoder architecture
    metadata_conditioning (`bool`, *optional*, defaults to `True)`:
        Whether or not to condition on the artist and genre metadata.
    metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`):
        Number of genres and the number of artists that were used to train the embedding layers of the prior
        models.
    min_duration (`int`, *optional*, defaults to 0):
        Minimum duration of the generated audio on which the model was trained.
    mlp_multiplier (`float`, *optional*, defaults to 1.0):
        Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
        the model will be used.
    music_vocab_size (`int`, *optional*, defaults to 2048):
        Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`.
    n_ctx (`int`, *optional*, defaults to 6144):
        Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
        generating music tokens.
    n_heads (`int`, *optional*, defaults to 2):
            Number of attention heads.
    nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
        Number of lyric tokens that are used when sampling a single window of length `n_ctx`
    res_conv_depth (`int`, *optional*, defaults to 3):
        Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
        `JukeboxMusicTokenConditioner`.
    res_conv_width (`int`, *optional*, defaults to 128):
        Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
        `JukeboxMusicTokenConditioner`.
    res_convolution_multiplier (`int`, *optional*, defaults to 1):
        Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
    res_dilation_cycle (`int`, *optional*):
        Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the
        corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level
        tokens.
    res_dilation_growth_rate (`int`, *optional*, defaults to 1):
        Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
    res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
        Downsampling rates used in the audio conditioning network
    res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
        Striding used in the audio conditioning network
    resid_dropout (`int`, *optional*, defaults to 0):
        Residual dropout used in the attention pattern.
    sampling_rate (`int`, *optional*, defaults to 44100):
        Sampling rate used for training.
    spread (`int`, *optional*):
        Spread used in the `summary_spread_attention` pattern
    timing_dims (`int`, *optional*, defaults to 64):
        Dimension of the timing embedding.
    zero_out (`bool`, *optional*, defaults to `False`):
        Whether or not to zero out convolution weights when initializing.
jukebox_priorn_positionsn_head)max_position_embeddingsnum_attention_heads
quick_gelur      D   g      ?r!   F@   NH   g?   皙?TP   iX     i\  i  g      ?i   i  r      r   r+   r+   r+   r+   r+   D  c+                 ~  > [         T,U ]  " S0 U+D6  Xl        X0l        X@l        XPl        X`l        Xpl        Xl        Xl	        Xl
        Xl        Xl        UU l        Ub  [        S0 UD6U l        OS U l        Xl        UU l        UU l        UU l        X l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l         U U l!        U!U l"        U"U l#        U#U l$        U$U l%        U%U l&        U&U l'        U'U l(        U(U l)        U)U l*        Xl+        U*U l,        g N )-super__init__act_fnalignment_headalignment_layerattention_multiplierattention_patternattn_dropoutattn_res_scaleblocksconv_res_scale
num_layersemb_dropoutmusic_vocab_sizer#   encoder_configencoder_loss_fraction
init_scaleis_encoder_decoderlyric_vocab_sizelevelmaskmax_durationmax_nb_genresmerged_decodermetadata_conditioningmetadata_dimsmin_durationmlp_multipliern_ctxn_headsnb_relevant_lyric_tokensres_conv_depthres_conv_widthres_convolution_multiplierres_dilation_cycleres_dilation_growth_rateres_downs_tres_strides_tresid_dropoutsampling_ratespreadtiming_dimshidden_sizezero_out)-selfr<   rM   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rH   rI   rd   rJ   rK   rL   rN   rO   rP   rQ   rR   rS   rT   rU   rG   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   re   kwargs	__class__s-                                               r   r;   JukeboxPriorConfig.__init__   sV   \ 	"6",.$8!!2(,,$& 0%"4"F~"FD"&D%:"$"4 0
	(*,%:"*(,
(@%,,*D'"4(@%&***&& r   pretrained_model_name_or_pathc                 T   U R                  U5        U R                  " U40 UD6u  pCUR                  S5      S:X  a  USU 3   nSU;   aM  [        U S5      (       a<  US   U R                  :w  a)  [
        R                  SUS    SU R                   S35        U R                  " U40 UD6$ )N
model_typejukeboxprior_You are using a model of type   to instantiate a model of type N. This is not supported for all configurations of models and can yield errors._set_token_in_kwargsget_config_dictgethasattrrl   loggerwarning	from_dict)clsrj   rM   rg   config_dicts        r   from_pretrained"JukeboxPriorConfig.from_pretrained\  s      (!112OZSYZ ??<(I5%ug&67K;&73+E+E+VbJcgjguguJuNN0\1J0KKk>>""pr
 }}[3F33r   )*r<   r=   r>   r?   r@   rA   rB   rC   rD   rF   rH   rI   rd   rJ   rK   rM   rL   rN   rO   rP   rQ   rR   rS   rT   rU   rG   rV   rW   rX   rE   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   re   )r   )__name__
__module____qualname____firstlineno____doc__rl   attribute_mapr;   classmethodr   strosPathLiker|   __static_attributes____classcell__rh   s   @r   r#   r#      s    kZ !J#0'M !/!"Dk!$#$!"W[!z 4E#r{{BR<S 4 4r   r#   c                      ^  \ rS rSrSrSrSSSSSS	/ S
QSS/ SQSSSSS/ SQ/ SQSSS4U 4S jjr\S\\	\
R                  4   4S j5       rSrU =r$ )JukeboxVQVAEConfigio  a6  
This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
`JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the VQVAE from
[openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    act_fn (`str`, *optional*, defaults to `"relu"`):
        Activation function of the model.
    nb_discrete_codes (`int`, *optional*, defaults to 2048):
        Number of codes of the VQVAE.
    commit (`float`, *optional*, defaults to 0.02):
        Commit loss multiplier.
    conv_input_shape (`int`, *optional*, defaults to 1):
        Number of audio channels.
    conv_res_scale (`bool`, *optional*, defaults to `False`):
        Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
    embed_dim (`int`, *optional*, defaults to 64):
        Embedding dimension of the codebook vectors.
    hop_fraction (`List[int]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
        Fraction of non-intersecting window used when continuing the sampling process.
    levels (`int`, *optional*, defaults to 3):
        Number of hierarchical levels that used in the VQVAE.
    lmu (`float`, *optional*, defaults to 0.99):
        Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
        of the original [VQVAE paper](https://huggingface.co/papers/1711.00937v2.pdf)
    multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
        Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
    res_conv_depth (`int`, *optional*, defaults to 4):
        Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
    res_conv_width (`int`, *optional*, defaults to 32):
        Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
    res_convolution_multiplier (`int`, *optional*, defaults to 1):
        Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
    res_dilation_cycle (`int`, *optional*):
        Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
        reduced by a power of `res_dilation_cycle`.
    res_dilation_growth_rate (`int`, *optional*, defaults to 3):
        Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
    res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
        Downsampling rate for each level of the hierarchical VQ-VAE.
    res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
        Stride used for each level of the hierarchical VQ-VAE.
    sample_length (`int`, *optional*, defaults to 1058304):
        Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
    init_scale (`float`, *optional*, defaults to 0.2):
        Initialization scale.
    zero_out (`bool`, *optional*, defaults to `False`):
        Whether or not to zero out convolution weights when initializing.
jukebox_vqvaerelur/   g{Gz?r2   Fr-   )g      ?      ?r   r   gGz?)r+   r2   r2   r       Nr4   r5   i & r0   c                    > [         TU ]  " S0 UD6  Xpl        X@l        UU l        Xl        X`l        X l        Xl        Xl	        Xl
        Xl        Xl        Xl        UU l        UU l        Xl        X0l        XPl        Xl        UU l        UU l        g r8   )r:   r;   hop_fractionconv_input_shapesample_lengthlevels	embed_dimnb_discrete_codesrZ   rY   r[   r]   r\   multipliersr^   r_   lmucommitrD   r<   rJ   re   )rf   r<   r   r   r   rD   r   r   r   r   r   rY   rZ   r[   r\   r]   r^   r_   r   rJ   re   rg   rh   s                         r   r;   JukeboxVQVAEConfig.__init__  s    0 	"6"( 0* "!2,,*D'(@%"4&&*,$ r   rj   c                 N   U R                  U5        U R                  " U40 UD6u  p2UR                  S5      S:X  a  US   nSU;   aM  [        U S5      (       a<  US   U R                  :w  a)  [
        R                  SUS    SU R                   S35        U R                  " U40 UD6$ )Nrl   rm   vqvae_configro   rp   rq   rr   )rz   rj   rg   r{   s       r   r|   "JukeboxVQVAEConfig.from_pretrained  s      (!112OZSYZ ??<(I5%n5K;&73+E+E+VbJcgjguguJuNN0\1J0KKk>>""pr
 }}[3F33r   )r<   r   r   rD   r   r   rJ   r   r   r   r   rY   rZ   r[   r\   r]   r^   r_   r   re   )r~   r   r   r   r   rl   r;   r   r   r   r   r   r|   r   r   r   s   @r   r   r   o  s}    4l !J &#$!"+.!` 4E#r{{BR<S 4 4r   r   c                   t   ^  \ rS rSrSrSr         S
U 4S jjr\S\\	   S\
4S j5       rU 4S jrS	rU =r$ )JukeboxConfigi  a
  
This is the configuration class to store the configuration of a [`JukeboxModel`].

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
yield a similar configuration to that of
[openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.


The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
(5,3), and strides = (2, 2) will downsample the audio by 2^5 = 32 to get the first level of codes, and 2**8 = 256
to get the second level codes. This is mostly true for training the top level prior and the upsamplers.

Args:
    vqvae_config (`JukeboxVQVAEConfig`, *optional*):
        Configuration for the `JukeboxVQVAE` model.
    prior_config_list (`List[JukeboxPriorConfig]`, *optional*):
        List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors.
    nb_priors (`int`, *optional*, defaults to 3):
        Number of prior models that will sequentially sample tokens. Each prior is conditional auto regressive
        (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
        trained using a top prior and 2 upsampler priors.
    sampling_rate (`int`, *optional*, defaults to 44100):
        Sampling rate of the raw audio.
    timing_dims (`int`, *optional*, defaults to 64):
        Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
        layer. The timing embedding layer converts the absolute and relative position in the currently sampled
        audio to a tensor of length `timing_dims` that will be added to the music tokens.
    min_duration (`int`, *optional*, defaults to 0):
        Minimum duration of the audios to generate
    max_duration (`float`, *optional*, defaults to 600.0):
        Maximum duration of the audios to generate
    max_nb_genres (`int`, *optional*, defaults to 5):
        Maximum number of genres that can be used to condition a single sample.
    metadata_conditioning (`bool`, *optional*, defaults to `True`):
        Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
        duration.

Example:

```python
>>> from transformers import JukeboxModel, JukeboxConfig

>>> # Initializing a Jukebox configuration
>>> configuration = JukeboxConfig()

>>> # Initializing a model from the configuration
>>> model = JukeboxModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
rm   c
                 8  > Uc  0 n[         R                  S5        [        S0 UD6U l        Ub"  U Vs/ s H  n[	        S0 UD6PM     snU l        Op/ U l        [        U5       HZ  nU
R                  SU 3S 5      nUc  0 n[         R                  SU S35        U R
                  R                  [	        S0 UD65        M\     U R                  R                  U l	        X0l
        Xl        X@l        XPl        X`l        Xpl        Xl        ["        TU ]H  " S0 U
D6  g s  snf )NzHvqvae_config is None. initializing the JukeboxVQVAE with default values.rn   zQ's  config is None. Initializing the JukeboxPriorConfig list with default values.r9   )rw   infor   r   r#   prior_configsrangepopappendr   	nb_priorsrP   ra   rc   rT   rO   rR   r:   r;   )rf   r   prior_config_listr   ra   rc   rT   rO   rP   rR   rg   prior_config	prior_idxrh   s                r   r;   JukeboxConfig.__init__$  s!    LKKbc.>>(Yj!kYj"4"D|"DYj!kD!#D"9-	%zzF9+*>E'#%LKK  ,# # ""))*<*L|*LM . !--::" +*&((%:""6"3 "ls   Dr   r   c                 z    U Vs/ s H  oDR                  5       PM     nnU " SXRR                  5       S.UD6$ s  snf )z
Instantiate a [`JukeboxConfig`] (or a derived class) from clip text model configuration and clip vision model
configuration.

Returns:
    [`JukeboxConfig`]: An instance of a configuration object
)r   vqvae_config_dictr9   )to_dict)rz   r   r   rg   configr   s         r   from_configsJukeboxConfig.from_configsR  sC     =JJM&^^-MJk%6J^J^J`kdjkk Ks   8c                    > [         TU ]  5       nUR                  S5       Vs/ s H  o"R                  5       PM     snUS'   U$ s  snf )Nr   r   )r:   r   r   )rf   resultr   rh   s      r   r   JukeboxConfig.to_dict^  sE    "FLjjQ`Fa&bFaF~~'7Fa&b"# 'cs   A)
r   rO   rP   rR   rT   r   r   ra   rc   r   )	NNr   r6   r-   r   g     @   T)r~   r   r   r   r   rl   r;   r   listr#   r   r   r   r   r   r   s   @r   r   r     sj    4l J ",#\ 	l.@)A 	lQc 	l 	l r   r   )r   r#   r   )r   r   typingr   configuration_utilsr   utilsr   
get_loggerr~   rw   r   r   r   r    r   r   r   r!   ATTENTION_PATTERNSr#   r   r   __all__r9   r   r   <module>r      s     	  4  
		H	%P b "[ () F "5(5 1)J(H.	 a4) a4Hy4) y4xw$ wt Hr   