
    cCiE                       S r SSKrSSKrSSKJr  SSKrSSKrSSKJ	s  J
r  SSKJ	r	  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJrJrJr  \R:                  " \5      rSS\ " S5      * 4S jr!S r"S r#S r$S r%S r& " S S\	RN                  5      r( " S S\	RN                  5      r) " S S\	RN                  5      r* " S S\	RN                  5      r+ " S S\	RN                  5      r, " S S \	RN                  5      r- " S! S"\	RN                  5      r. " S# S$\	RN                  5      r/ " S% S&\	RN                  5      r0S'r1\" S(\15       " S) S*\5      5       r2 " S+ S,\	RN                  5      r3 " S- S.\5      r4 " S/ S0\	RN                  5      r5 " S1 S2\	RN                  5      r6 " S3 S4\	RN                  5      r7 " S5 S6\	RN                  5      r8 " S7 S8\	RN                  5      r9 " S9 S:\	RN                  5      r: " S; S<\	RN                  5      r; " S= S>\	RN                  5      r< " S? S@\5      r= " SA SB\5      r>SCr?\" SD\15       " SE SF\>5      5       r@/ SGQrAg)HzPyTorch Jukebox model.    N)Optional)nn)	LayerNorm   )ACT2FN)PreTrainedModel)add_start_docstringslogging)tqdm   )ATTENTION_PATTERNSJukeboxConfigJukeboxPriorConfigJukeboxVQVAEConfig        Infc                    U R                  5       n [        XR                  S5      5      nUS:  a&  U [        R                  " XSS9S   SSS24   :  nX0U'   US:  a  [        R
                  " U SSS9u  pV[        R                  " [        R                  " USS9SS9nXr:  nUSSS24   R                  5       USS	S24'   SUS
'   [        R                  " U [        R                  S9R                  SXhS9nX0U'   U $ )a  
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering

Args:
    logits (`torch.Tensor`):
        logits distribution shape (vocabulary size)
    top_k (`int`, *optional*, defaults to 0):
        When `top_k >0` keep only top key tokens with highest probability (top-k filtering).
    top_p (`int`, *optional*, defaults to 0):
        When `top_p>0.0` keep the top tokens with cumulative probability >= `top_p` (nucleus filtering).
r   dim.Nr   T)
descendingr   r   ).r   dtype)r   indexsrc)cloneminsizetorchtopksortcumsumFsoftmax
zeros_likeboolscatter_)	logitstop_ktop_pfilter_valueindices_to_removesorted_logitssorted_indicescumulative_probssorted_indices_to_removes	            q/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.pyfilter_logitsr2   %   s    \\^F{{2'Eqy"UZZ2%Fq%I#rs(%SS$0 !s{(-

6dPR(S% <<		-R(HbQ $4#; ,DS#2#X,N,T,T,V ab)+, ( ",,V5::FOO. P 
 %1 !M    c           	      T   U S   n [        U 5      U:  a  [        R                  " [        R                  " U[        U 5      -
  [        R                  S9R                  U R                  5      U /5      nS/U[        U 5      -
  -  [        [        S[        U 5      5      5      -   nOt[        [        U 5      X4S-  -   -  U-  5      n[        [        XqS-  5      [        U 5      US-  -
  5      nXUS-  -
  XqS-  -    n[        [        XqS-  -
  XqS-  -   5      5      nUR                  SS9U4$ )a  
Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
returned. If the provided token sequence is smaller, it will be padded, otherwise, only characters ranging from the
midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
the most relevant tokens (in time) for the sequence.

Args:
    full_tokens (`list[int]`):
        List containing the token ids of the entire lyrics.
    total_length (`int`):
        Total expected length of the music (not all of it is generated, see duration), in samples.
    offset (`int`):
        Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
        account
    duration (`int`):
        Expected duration of the generated music, in samples. The duration has to be smaller than the total length,
        which represent the overall length of the signal,
r   r   r          @   r   )lenr   catzeroslongtodevicelistrangeintr   max	unsqueeze)full_tokensmax_n_lyric_tokenstotal_lengthoffsetdurationtokensindicesmidpoints           r1   get_relevant_lyric_tokensrJ   K   s/   & a.K
;,,[[+c+.>>ejjQTTU`UgUghjuv
 $,s;/??@4aQTU`QaHbCccs;'6sN+BClRSs81%<=s;?ORdhiRi?ij(:a(??(cdMdBdeuXa(??bcLcAcde"G++r3   c                 v    / n[        SX-
  U-   U5       H   nXA-   U :  a  X-
  nUR                  U5        M"     U$ Nr   )r>   append)rD   n_ctx
hop_lengthstartsstarts        r1   
get_startsrR   m   sH    Fq,.;ZH=L( (Ee	 I
 Mr3   c           
         UR                   S-
  nUR                  nX   nUR                  S   UR                  S   pX:  aV  XX-
  n	[        R                  " U[        R
                  " XuU-
  UR                  UR                  S9/SS9nUR                  S   nOSn	[        UR                  U* S-
     UR                  -  5      n
UR                  S   UR                  S   pU1n0 n0 n[        [        XU
5      SS9 H  nUU-   nUR                  UUUR                  SSS9u  nn[        R                   " XgSS9n[        R                   " UUSS9n/ n[#        UU5       H=  u  nnUR%                  US S 2UU24   / UUS	9nUR'                  US   S S 2U4   5        AM?     [        R                  " USS9nAUR)                  S
[        R*                  S9R-                  5       nAUUU'   UUU'   M     / n[/        U5       H  nUSSS 24   n[0        R
                  " U[3        U5      S-   45      n[5        [        XU
5      5       H!  nUU-   nUU   U   nUU   U   n UUUU2U 4'   M#     US X-
  2S S24   nUR'                  U5        M     U$ )Nr   r   r   r<   r   z#Computing lyric to music alignment )descT)get_indicesrE   )get_attn_weightscpur<   r      r   )levelsrN   shaper   r8   r9   r   r<   r?   hop_fractionprior_alignment_headprior_alignment_layerr   rR   get_metadatasample_lengthchunkzipforward_tokensrM   r;   floatnumpyr>   npr7   reversed)!music_tokenslabelspriorconfiglevelrN   rG   
batch_sizerD   padding_lengthrO   alignment_headalignment_layerattn_layersalignment_hopsindices_hopsrQ   endmetadataindices_hop	tokens_bsmetadata_bsw_hopstokens_i
metadata_iw_hopweightsalignment_hop
alignmentsitemrB   	alignmentrH   s!                                    r1   get_alignmentr   w   s   LL1EKKE F%||AQ-U[[\-A^d^k^klmst
 ||AV((%!4u{{BCJ&,&A&A!&DfFbFbcdFeO"#KNLjjAHmnem % 2 265&BVBVdhqr 2 s+KK:	kk(JA>$'	;$? Hj((!U3Y,)?Zbm(nEMM%(1n#456 %@ ))F*

%u{{
CIIK *U -u' o. Jj!QUmHHlC,<q,@AB	jjIJE%-C*51$7M"5)$/G,9IeCi()	 K
 = ==ssBC	)$ " r3   c                    [         R                  " USS5      R                  5       R                  5       n[	        [        UR                  S   5      5       Hw  nUbP  [	        U5      U   R                  5       u  pVnU  SU SU SU SUS S  SU 3n[        R                  " XU   5        MV  [        R                  " U  SU SU 3X4   5        My     g )Nr   r   r   z/lvl_-   z-sample-)
r   clamprX   rf   r=   r>   r\   valuesrg   save)	fnamelvlmetasaudiartistsgenreslyricspaths	            r1   save_temp_audior      s    
++c2q
!
%
%
'
-
-
/C%		!%&&*5k!n&;&;&=#GVWE#ay&6"1:,asKDGGDa&!GGugU3%xs3SV< 'r3   c                    U b  US:X  a  g U(       a  Xq-
  O[        X!-
  S5      nU S:X  a%  [        R                  " XUS9R                  U5      n OU S:X  a  [        R                  " XUS9R                  5       n [        R                  " XUS9R                  5       n U R	                  XX-  5      S S 2S S2U* U-  S 24   n [        R
                  R                  R                  U SSS9R                  5       R	                  X5      n O*U S	:X  a$  [        R                  " XUS9R                  U5      n U R	                  SSX5      $ )
Nr   r   autoregressiver<   summaryr   r   r   r   r   )valueprime)	r@   r   onestrilviewr   
functionalpad
contiguous)	maskquery_lengthkey_value_lengthblocksspreadr<   samplesample_trE   s	            r1   get_maskr      s,   ||q((.X$C8H8WYZ4[Fzz,HMMfU		zz,VDIIKzz,VDIIKyy|/EFq#2#P`O`djOjOlGlmHH## $ 
 Z\T,1 	 
zz,HMMfU99Q<::r3   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JukeboxConv1D   c                    > [         TU ]  5         Xl        X l        [        R
                  " X5      n[        R                  " U5      n[        R                  " U5      U l	        [        R                  " U5      U l
        g N)super__init__input_widthoutput_widthr   emptyr9   r   	Parameterweightbias)selfr   r   r   r   	__class__s        r1   r   JukeboxConv1D.__init__   sV    &([7{{<(ll6*LL&	r3   c           	      :   / UR                  5       S S QU R                  P7n[        R                  " U R                  R                  U5      UR                  SUR                  S5      5      U R                  R                  U5      5      nUR                  " U6 nU$ )Nr   )r   r   r   addmmr   type_asr   r   )r   hidden_statessize_outs      r1   forwardJukeboxConv1D.forward   s    B]'')#2.B0A0ABIIm,r=#5#5b#9:KK.

 &**H5r3   )r   r   r   r   __name__
__module____qualname____firstlineno__r   r   __static_attributes____classcell__r   s   @r1   r   r      s    ' r3   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )JukeboxResConv1DBlock   c                   > [         TU ]  5         UR                  U-  nUR                  U-  nUnX@l        [
        R                  " 5       U l        [
        R                  " X%SSXv5      U l	        [
        R                  " XRSSS5      U l
        g )NrZ   r   r   )r   r   res_convolution_multiplierres_dilation_growth_rate	res_scaler   ReLU
activationConv1dconv1d_1conv1d_2)	r   rl   
conv_widthdepthr   
hidden_dimdilationpaddingr   s	           r1   r   JukeboxResConv1DBlock.__init__   sq    66C
22E9"'')		*!QR		*!QBr3   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nX R                  U-  -   $ r   r   r   r   r   )r   r   	residualss      r1   r   JukeboxResConv1DBlock.forward   sQ    !	6m46m4>>M999r3   r   )r         ?r   r   s   @r1   r   r      s    	C: :r3   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )JukeboxResnet1D   c           	        > [         T	U ]  5         UR                  U l        UR                  (       d  SOS[
        R                  " U5      -  n/ n[        U5       H;  nU R                  c  UOXpR                  -  nUR                  [        XX5      5        M=     U(       a  US S S2   n[        R                  " U5      U l        g )Nr   r   )r   r   res_dilation_cycledilation_cycleconv_res_scalemathsqrtr>   rM   r   r   
ModuleListresnet_block)
r   rl   r   n_depthreverse_dilationr   r   r   block_depthr   s
            r1   r   JukeboxResnet1D.__init__  s    $77%44C#		'@R:R	7^E#'#6#6#>%EL_L_D_KMM/K[\ $ DbD\FMM&1r3   c                 <    U R                    H  nU" U5      nM     U$ r   )r   r   r   blocks      r1   r   JukeboxResnet1D.forward  s"    &&E!-0M 'r3   )r   r   Fr   r   s   @r1   r   r      s    2 r3   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JukeboxEncoderConvBlocki  c           
        > [         TU ]  5         / nUS-  nUS-  n	US:  a\  [        U5       HM  n
UR                  [        R
                  " U
S:X  a  UOUX8Xi5      5        UR                  [        XU5      5        MO     [        R
                  " X1R                  SSS5      U l        [        R                  " U5      U l
        g )Nr6   r   rZ   r   )r   r   r>   rM   r   r   r   	embed_dimproj_outr   downsample_block)r   rl   r   r   r   down_tstride_tr   filter_tpad_tr   r   s              r1   r    JukeboxEncoderConvBlock.__init__  s    a<AA:6]biiQ!V	Zckstof%HI # 		*.>.>1aH "f 5r3   c                 ^    U R                    H  nU" U5      nM     U R                  U5      nU$ r   r   r   r   s      r1   r   JukeboxEncoderConvBlock.forward"  s0    **E!-0M +m4r3   r   r   r   s   @r1   r   r     s    
6 r3   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JukeboxEncoderi)  c                 N  > [         TU ]  5         X@l        [        R                  " 5       U l        [        [        [        U R                  5      5      XV5      nU HH  u  pn
U R
                  R                  [        XS:X  a  UR                  OUR                  X#X5      5        MJ     g rL   )r   r   r[   r   r   level_blocksrc   r=   r>   rM   r   conv_input_shaper   )r   rl   widthr   r[   downs_t	strides_titeratorr   r   r   r   s              r1   r   JukeboxEncoder.__init__*  s    MMOtE$++./D#+Ax$$'AvF336CSCSUZci $,r3   c                     / n[        U R                  5       H+  nU R                  U   nU" U5      nUR                  U5        M-     U$ r   )r>   r[   r  rM   )r   r   all_hidden_statesrm   level_blocks        r1   r   JukeboxEncoder.forward7  sM     4;;'E++E2K'6M$$]3 (
 ! r3   r  r[   r   r   s   @r1   r  r  )  s    	! 	!r3   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )JukeboxDecoderConvBockiC  c           
        > X l         X0l        [        TU ]  5         / nUS:  a  US-  n	US-  n
[        R
                  " X#SSS5      U l        [        U5       HP  nUR                  [        XXG5      5        UR                  [        R                  " X;US-
  :  a  UOUXU
5      5        MR     [        R                  " U5      U l        g )Nr   r6   rZ   r   )r   r   r   r   r   r   proj_inr>   rM   r   ConvTranspose1dr   upsample_block)r   rl   r   r   r   r   r   r   r   r   r   r   r   s               r1   r   JukeboxDecoderConvBock.__init__D  s    "$A:!|HME99YAq!DDL6]of%Z[&&"fqj.JiQYej # !mmF3r3   c                 ^    U R                  U5      nU R                   H  nU" U5      nM     U$ r   )r  r  r   s      r1   r   JukeboxDecoderConvBock.forwardV  s0    ]3((E!-0M )r3   )r   r   r  r  Tr   r   s   @r1   r  r  C  s    4$ r3   r  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )JukeboxDecoderi]  c                   > [         T
U ]  5         X@l        [        R                  " 5       U l        [        [        [        U R                  5      5      XV5       H6  u  pxn	U R
                  R                  [        XR                  X#X5      5        M8     [        R                  " UR                  UR                  SSS5      U l        g )NrZ   r   )r   r   r[   r   r   r  rc   r=   r>   rM   r  r   r   r  out)r   rl   r   r   r[   r  r  rm   r   r   r   s             r1   r   JukeboxDecoder.__init__^  s    MMO'*4dkk0B+CW'X#E8$$&v/?/?TZe (Y
 99V--v/F/F1aPr3   c                     US   n[        [        U R                  5      5       H5  nU R                  U   nU" U5      nUS:w  d  M"  U(       d  M+  X1US-
     -   nM7     U R	                  U5      nU$ )Nr   r   r   )rh   r>   r[   r  r  )r   r   
all_levelshidden_staterm   r  s         r1   r   JukeboxDecoder.forwardi  sr    $R( eDKK01E++E2K&|4Lzjj+EAI.FF 2 xx-r3   )r  r[   r  r  r   r   s   @r1   r  r  ]  s    	Q r3   r  c                   p   ^  \ rS rSrS\4U 4S jjrS rS rS rS r	S r
S	 rS
 rS rS rSS jrSrU =r$ )JukeboxBottleneckBlockix  rl   c                 6  > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        SU l        SU l        S U l	        S U l
        U R                  S[        R                  " U R                  U R                  5      5        g )Nr   Fcodebook)r   r   nb_discrete_codesr   codebook_widthlmumu	thresholdinitcodebook_sumcodebook_elemregister_bufferr   r9   )r   rl   r   s     r1   r   JukeboxBottleneckBlock.__init__y  sz    !'!9!9$..**	 !ZT5K5KTM`M`)abr3   c                     UR                   u  p#X R                  :  a\  U R                  U-   S-
  U-  nS[        R                  " U5      -  nUR	                  US5      nU[
        R                  " U5      U-  -   nU$ )Nr   {Gz?)r\   r'  rg   r   repeatr   
randn_like)r   r   r   embed_width	n_repeatsstds         r1   _tileJukeboxBottleneckBlock._tile  s{    (..'''//#59cAI--C)00A>M)E,<,<],Kc,QQMr3   c                 $   U R                   nSU l        U R                  U5      nU[        R                  " UR
                  S   5         S U U l        U R                  U l        [        R                  " X R                  R                  S9U l
        g )NTr   r   )r'  r,  r8  r   randpermr\   r&  r-  r   r<   r.  )r   r   r'  codess       r1   init_codebook$JukeboxBottleneckBlock.init_codebook  sq     22	

=)ennU[[^<=>P?PQ MM"ZZ(9--BVBVWr3   c           	      n   U R                   U R                  U R                  pTn[        R                  " 5          [        R
                  " XQR                  S   UR                  S9nUR                  SUR                  SUR                  S   5      S5        [        R                  " Xa5      nUR                  SS9nU R                  U5      n	U	[        R                  " U	R                  S   5         S U n
U R                  nX0R                  -  SU-
  U-  -   U l        X0R                   -  SU-
  U-  -   U l        U R                   R                  US5      U R"                  :  R%                  5       nU R                  R                  XT5      U R                   R                  US5      -  nX-  SU-
  U
-  -   U l        U[        R                  " U5      -  n[        R                  " U[        R&                  " US-   5      -  5      * nXR"                  :  R                  5       n[        R                  " U5      n[        R(                  R+                  U R                  U-
  5      [,        R.                  " [,        R0                  " UR                  5      5      -  nS S S 5        WWWWS.$ ! , (       d  f       N= f)	Nr   r   r   r   r   r   g:0yE>)entropy	used_currusagedk)r*  r(  r'  r   no_gradr9   r\   r<   r'   r   matmulsumr8  r;  r&  r-  r.  r+  re   loglinalgnormrg   r   prod)r   r   latent_statesr*  r(  r'  latent_states_onehot_codebook_sum_codebook_elemr<  _random_codebookold_codebookrB  	norm_code_codebook_probr@  rA  rC  s                     r1   update_codebook&JukeboxBottleneckBlock.update_codebook  sH   049L9LdNdNd-]]_ $);;/@BUBUVWBXanauau#v  ))!]-?-?=CVCVWXCY-Z\]^!LL)=MM155"5=NJJ}-E$U^^EKKN%CDEWFWX  ==L "%6%6 6#(m9S SD!#&8&8!8C"H;V!VD'',,->BdnnT[[]E))../@QTXTfTfTkTk!1U I "/1u9@P2PPDM+eii.GGNyy%))NT<Q2R!RSSG'>>9>>@IIIe$E""4==<#?@277277S_SeSeKfCggB3 4 #URTUU5 _s   IJ&&
J4c                    UR                  SSS5      R                  5       nUR                  SUR                  S   5      nUR                  S   U R                  :X  an  [
        R                  R                  U[
        R                  " U5      -
  5      [        R                  " [        R                  " UR                  5      5      -  nX4$ UR                  S   SU R                  -  :X  Ga   USS U R                  24   USU R                  S 24   pC[
        R                  R                  U[
        R                  " U5      -
  5      [        R                  " [        R                  " UR                  5      5      -  [
        R                  R                  U[
        R                  " U5      -
  5      [        R                  " [        R                  " UR                  5      5      -  -   nX4-   nUW4$ )Nr   r6   r   r   .)permuter   r   r\   r(  r   rH  rI  meanrg   r   rJ  )r   r   prenormx1x2s        r1   
preprocess!JukeboxBottleneckBlock.preprocess  s   %--aA6AAC%**2}/B/B2/FGr"d&9&99ll''

=8Q(QRUWU\U\++,V G %%   $D,?,?(??"3(=$*=*=(=#=>cSWSfSfShNh@i||((ejjn)<=PRPXPXHY@ZZ!!"uzz"~"56AR9SSG
 GMg%%r3   c                     Uu  pEUR                  XES5      R                  SSS5      R                  5       nUR                  XE5      nX4$ )Nr   r   r6   r   )r   rV  r   )r   rK  dequantised_statesx_shapern   times         r1   postprocess"JukeboxBottleneckBlock.postprocess  sQ    "
/44ZrJRRSTVWYZ[ffh%**:<00r3   c                 0   U R                   R                  5       n[        R                  " US-  SSS9S[        R                  " X5      -  -
  [        R                  " US-  SSS9-   n[        R
                  " USS9u  pE[        R                  " U5      nXV4$ )Nr6   r   Tr   keepdimr   r   )r&  tr   rF  rE  r   rW  )r   rK  codebook_weightsdistancemin_distanceri   fits          r1   quantiseJukeboxBottleneckBlock.quantise  s    ==??,IImQ&B=%,,}??@ii(!+DAB 	
 &+YYxR%@"jj&  r3   c                 F    [         R                  " XR                  5      nU$ r   )r#   	embeddingr&  )r   ri   r^  s      r1   
dequantise!JukeboxBottleneckBlock.dequantise  s    [[}}E!!r3   c                     UR                   u  p#nU R                  U5      u  pU R                  U5      u  pSUR                  X$5      nU$ r   )r\   r[  rk  r   )r   rK  samples_seq_lenri   s         r1   encodeJukeboxBottleneckBlock.encode  sO    +11G  ??=9 --6 $((:r3   c                     UR                   u  p#U R                  U5      nUR                  X#U R                  5      R	                  SSS5      R                  5       nU$ Nr   r6   r   )r\   ro  r   r(  rV  r   )r   ri   rr  rt  r^  s        r1   decodeJukeboxBottleneckBlock.decode  sa    '-- "__\: ##Gd6I6IJRRSTVWYZ[ffh 	 "!r3   c           	      2   UR                   u  p4nU R                  U5      u  pU(       a"  U R                  (       d  U R                  U5        U R	                  U5      u  pxU R                  U5      n	U(       a  U R                  X5      n
O0 n
[        R                  R                  U	R                  5       U-
  5      S-  [        R                  " UR                   5      -  nXU-
  R                  5       -   n	U R                  XyX545      u  pyXyU[        SXS.U
D64$ )Nr6   )rj  pn )r\   r[  r,  r=  rk  ro  rS  r   rH  rI  detachrg   rJ  ra  dict)r   r   rS  rr  rs  rt  rX  ri   rj  r^  update_metricscommit_losss               r1   r   JukeboxBottleneckBlock.forward  s   +11G "&!? 499}- !MM-8!__\: !11-NNN ll''(:(A(A(Cm(STXYY\^\c\c]
 

 +=.P-X-X-ZZ ,0+;+;L_f^p+q(d>is>iZh>iiir3   )r&  r.  r-  r(  r,  r*  r'  r+  r  )r   r   r   r   r   r   r8  r=  rS  r[  ra  rk  ro  ru  ry  r   r   r   r   s   @r1   r$  r$  x  sM    	c1 	cXV<&&1
!"
"j jr3   r$  c                   >   ^  \ rS rSrU 4S jrS rSS jrS rSrU =r	$ )JukeboxBottlenecki  c                    > [         TU ]  5         X l        [        R                  " 5       U l        [        U R                  5       H'  nU R
                  R                  [        U5      5        M)     g r   )	r   r   r[   r   r   r  r>   rM   r$  )r   rl   r[   rm   r   s       r1   r   JukeboxBottleneck.__init__  sN    MMO4;;'E$$%;F%CD (r3   c                     [        U R                  U5       VVs/ s H  u  p#UR                  U5      PM     nnnU$ s  snnf r   )rc   r  ru  )r   	raw_audior  r   ri   s        r1   ru  JukeboxBottleneck.encode  sI    RUVZVgVgirRs
Rs2N;K}-Rs 	 
 
s   <c                     Uc  U R                   n[        U R                  X# U5       VVs/ s H  u  pEUR                  U5      PM     nnnU$ s  snnf r   )r[   rc   r  ry  )r   ri   start_level	end_levelr  zquantised_audios          r1   ry  JukeboxBottleneck.decode$  s]    I:=d>O>OP[>fht:u
:u&6{Kq!:u 	 
 
s   Ac                    / / / / 4u  p#pE[        U R                  5       H  nU R                  U* S-
     nX   nU" XR                  S9u  ppUR	                  U	5        U R                  (       d  U
R                  5       n
UR	                  U
5        UR	                  U5        U R                  (       d  M  UR	                  U5        M     X#XE4$ )Nr   )rS  )r>   r[   r  trainingrM   r~  )r   input_audiori   quantised_statescommit_lossesmetricsrm   r  r   sampled_tokensquantised_stater  metrics                r1   r   JukeboxBottleneck.forward,  s    ACRR>4;;'E++UFQJ7K'.MCN}}D@N[ /== #2"8"8":##O4  -}}}v& ( }EEr3   r  r   N)
r   r   r   r   r   ru  ry  r   r   r   r   s   @r1   r  r    s    EF Fr3   r  a?  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config (`JukeboxConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
zThe Hierarchical VQ-VAE model used in Jukebox. This model follows the Hierarchical VQVAE paper from [Will Williams, Sam
Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://huggingface.co/papers/2002.08111).

    c                      ^  \ rS rSr% \\S'   SrS rS\4U 4S jjrSS jr	SS\
R                  4S jjrSS	 jrSS
 jrS rS\
R                   S\\
R                  \
R                  4   4S jrSrU =r$ )JukeboxVQVAEiQ  rl   vqvaec                 j   [        U[        R                  5      (       a=  UR                  R                  R                  SSU R                  R                  -  S9  GO[        U[        5      (       a|  U R                  R                  (       a%  UR                  R                  R                  5         OUR                  R                  R                  SSU R                  R                  -  S9  O[        U[        5      (       aw  U R                  R                  (       a\  UR                  R                  R                  R                  5         UR                  R                  R                  R                  5         [        U[        R                  5      (       aI  UR                  R                  R                  5         UR                  R                  R                  S5        [        U[        R                   5      (       a3  UR                  b%  UR                  R                  R                  5         g g g )Nr   {Gz?rW  r7  r   )
isinstancer   	Embeddingr   datanormal_rl   
init_scaler   zero_outzero_r   r   r   r   fill_Linearr   modules     r1   _init_weightsJukeboxVQVAE._init_weights\  se   fbll++MM&&CTDKK<R<R5R&S..{{##""((*""**@V@V9V*W 5664;;;O;OOO""''--/OO  %%++-fbll++KK""$MM$$S)fbii((V[[-DKK""$ .E(r3   c                   > [         TU ]  U5        UR                  nUR                  nUR                  (       d  [        X25       VVs/ s H	  u  pEXE-  PM     nnn[        R                  " U5      nUR                  UR                  -  U-  U-  Ul        UR                  R                  [        5      Ul        UR                  U l        UR                  U l        UR                  U l        [        X25       VVs/ s H	  u  pEXE-  PM     snnU l        [        R                  " U R                  5      U l        UR"                  =U l        n[%        U5       V	s/ s H,  n	[        U R                  U R                   U	* S-
     -  5      PM.     sn	U l        UR(                  b  UR(                  OS/U-  U l        [*        R,                  " 5       U l        [*        R,                  " 5       U l        [%        U5       H  n	UR2                  U R(                  U	   -  n
UR4                  U R(                  U	   -  nU R.                  R7                  [9        XXS-   US U	S-    US U	S-    5      5        U R0                  R7                  [;        XXS-   US U	S-    US U	S-    5      5        M     [=        X5      U l        g s  snnf s  snnf s  sn	f Nr   ) r   r   res_downs_tres_strides_tra   rc   rg   rJ  sample_length_in_secondssampling_rateastyper?   r'  commitdownsamplescumprodhop_lengthsr[   r>   music_tokens_shapesmultipliersr   r   encodersdecodersres_conv_widthres_conv_depthrM   r  r  r  
bottleneck)r   rl   r  r  stridedownr  top_raw_to_tokensr[   rm   r  r   r   s               r1   r   JukeboxVQVAE.__init__m  sz    $$((	##<?	<ST<SLF6<<SKT " 4//&2F2FFJ[[!$"F  $*#7#7#>#>s#CF !'!9!9mm#11=@=TU=T\VFL=TU::d&6&67%}},fSXY_S`$
S`%S##t'7'7
'CCDS`$
  281C1C1O6--VWUX[aUa6]E))D,<,<U,CCE))D,<,<U,CCEMM  veQY%RS)@TV_`kbgjkbkVlm MM  veQY%RS)@TV_`kbgjkbkVlm # ,F;A U V$
s   J2:J83J>c                     Uc  U R                   nU R                  R                  XUS9nU R                  U   USS peU" USS9nUR	                  SSS5      nU$ )Nr  r  r   r   Fr   r6   )r[   r  ry  r  rV  )r   ri   r  r  rK  decoderdequantised_states          r1   _decodeJukeboxVQVAE._decode  sm    I..|`i.j%)]];%?qQRAS"#$5%H-55aA>  r3   returnc           	         U Vs/ s H  n[         R                  " XTSS9PM     nn/ n[        U5       H8  nU V	s/ s H  oU   PM	     n
n	U R                  XUS9nUR	                  U5        M:     [         R
                  " USS9$ s  snf s  sn	f )a]  
Transforms the input `music_tokens` to their `raw_audio` representation.

Args:
    music_tokens (`torch.LongTensor`):
        Tensor of music tokens which will be decoded to raw audio by using the codebook. Each music token
        should be an index to a corresponding `code` vector in the codebook.
    start_level (`int`, *optional*):
        Level at which the decoding process will start. Default to 0.
    end_level (`int`, *optional*):
        Level at which the decoding process will start. Default to None.
    bs_chunks (int, *optional*):
        Number of chunks to process at the same time.
r   r   r  )r   rb   r>   r  rM   r8   )r   ri   r  r  	bs_chunkstokentoken_chunksr^  r   chunksmusic_tokens_ir  s               r1   ry  JukeboxVQVAE.decode  s     KWW,E!<,Wy!A6BClFQilNC $^`i j%%&78 " yy+33 X Ds
   BB	c                 2   Uc  U R                   nUR                  SSS5      R                  5       n/ n[        U R                   5       H.  nU R                  U   nU" U5      nUR                  US   5        M0     U R                  R                  U5      n	XU $ )Nr   r6   r   r   )r[   rV  re   r>   r  rM   r  ru  )
r   r  r  r  r  rK  rm   encoderlatent_stateri   s
             r1   _encodeJukeboxVQVAE._encode  s    I''1a06684;;'EmmE*G";/L  b!12 ( --m<	22r3   c                     [         R                  " XSS9n/ nU H$  nU R                  XrUS9nUR                  U5        M&     [	        U6  V	s/ s H  n	[         R
                  " U	SS9PM     n
n	U
$ s  sn	f )a  
Transforms the `input_audio` to a discrete representation made out of `music_tokens`.

Args:
    input_audio (`torch.Tensor`):
        Raw audio which will be encoded to its discrete representation using the codebook. The closest `code`
        form the codebook will be computed for each sequence of samples.
    start_level (`int`, *optional*, defaults to 0):
        Level at which the encoding process will start. Default to 0.
    end_level (`int`, *optional*):
        Level at which the encoding process will start. Default to None.
    bs_chunks (int, *optional*, defaults to 1):
        Number of chunks of raw audio to process at the same time.
r   r   r  )r   rb   r  rM   rc   r8   )r   r  r  r  r  audio_chunksmusic_tokens_listchunk_ir  music_tokens_levelri   s              r1   ru  JukeboxVQVAE.encode  s~     {{;qA#G!\\'V_\`N$$^4 $ X[\mWnoWnAS		"4!<Wno ps   A/c           
          U R                    Vs/ s H(  n[        R                  " SU R                  U/UQ7SS9PM*     nnU R	                  U5      $ s  snf )Nr   rX   )r   r<   )r  r   randintr'  ry  )r   	n_samplesmusic_tokens_shaperi   s       r1   r   JukeboxVQVAE.sample  s`     '+&>&>
&>" MM!T339:ZGY:Zchi&> 	 
 {{<((	
s   /Ar  c                    UR                  SSS5      R                  5       n/ n[        U R                  5       H.  nU R                  U   nU" U5      nUR                  US   5        M0     U R                  U5      u  pxp/ n
[        U R                  5       H@  nU R                  U   nU" XUS-    SS9nU
R                  UR                  SSS5      5        MB     [        U	5      nU R                  U-  nX4$ )a  
Forward pass of the VQ-VAE, encodes the `raw_audio` to latent states, which are then decoded for each level.
The commit loss, which ensure that the encoder's computed embeddings are close to the codebook vectors, is
computed.

Args:
    raw_audio (`torch.FloatTensor`):
        Audio input which will be encoded and decoded.

Returns:
    `tuple[torch.Tensor, torch.Tensor]`


Example:
```python
>>> from transformers import JukeboxVQVAE, set_seed
>>> import torch

>>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
>>> set_seed(0)
>>> zs = [torch.randint(100, (4, 1))]
>>> model.decode(zs).shape
torch.Size([4, 8, 1])
```
r   r6   r   r   Fr  )
rV  re   r>   r[   r  rM   r  r  rF  r  )r   r  r  rK  rm   r  r  rs  ri   r  r^  r  r  r  losss                  r1   r   JukeboxVQVAE.forward  s    8  ''1a06684;;'EmmE*G";/L  b!12 (
 -1OOM,J)4;;'EmmE*G 'UQY(GTY Z%%&7&?&?1a&HI (
 -({{[(!''r3   )r  r  r  r  r  r  r[   r  r  r'  ra   r  r   Nr   )r   r   r   r   r   __annotations__base_model_prefixr  r   r  r   Tensorry  r  ru  r   FloatTensortupler   r   r   r   s   @r1   r  r  Q  s{     %"%<1 %<N	!4RWR^R^ 4.3.)-(!2!2 -(uU\\5<<=W7X -( -(r3   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
JukeboxMLPi  c                 $  > [         TU ]  5         UR                  n[        UR                  U-  5      n[        X#5      U l        [        X25      U l        [        UR                     U l
        [        R                  " UR                  5      U l        g r   )r   r   hidden_sizer?   mlp_multiplierr   c_fcc_projr   act_fnactr   Dropoutresid_dropoutdropout)r   rl   r   r   r   s       r1   r   JukeboxMLP.__init__  sl    &&	..:;
!)8	#J:&--(zz&"6"67r3   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r  )r   r   s     r1   r   JukeboxMLP.forward  s@    		-0/M2]3r3   )r  r  r  r  r   r   s   @r1   r  r    s    	8 r3   r  c                   8   ^  \ rS rSrSU 4S jjrU 4S jrSrU =r$ )JukeboxLayerNormi%  c                    > [         TU ]  XUS9  [        R                  " U5      U l        SU R                  -  U l        g )N)epselementwise_affinei  )r   r   rg   rJ  r  	max_numel)r   normalized_shaper  r  r   s       r1   r   JukeboxLayerNorm.__init__&  s8    )GYZWW-.
+r3   c                   > UR                  5       U R                  :  aP  [        R                  " XR                  U R
                  U R                  U R                  5      R                  U5      $ [        TU ])  U5      R                  U5      $ r   )numelr   r#   
layer_normr  r   r   r  r   r   r   )r   inputr   s     r1   r   JukeboxLayerNorm.forward+  se    ;;=4>>)<<'<'<dkk499VZV^V^_gghmnn7?5)11%88r3   )r   r  )gh㈵>Tr   r   s   @r1   r  r  %  s    ,
9 9r3   r  c                      ^  \ rS rSrSU 4S jjrS rS rSS jrS rS r	S r
S	 rS
 rS rS rSS jrSS jrSS jrSS jr\S 5       rS rSS jrS rS rSS jrS rS rSrU =r$ )JukeboxAttentioni2  c           	      D  > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        [        UR                  U R                  -  5      nXAR                  -  U l	        X l
        X@l        U R                  S-  U l        UR                  U l        US:X  a:  [        U R                  U5      U l        [        U R                  US-  5      U l        O[        U R                  US-  5      U l        [        X@R                  5      U l        [$        R&                  " UR
                  5      U l        [$        R&                  " UR(                  5      U l        X0l        US:X  a  U R,                  U l        O)US:X  a  U R0                  U l        OU R2                  U l        U R4                  S4U R6                  S4U R8                  S4U R:                  S 4U R<                  S4U R>                  S4U R4                  S 4U R@                  S4S	.nXS   u  U l!        U l"        URF                  U l#        URH                  U l$        U RF                  b  U R                  U RF                  -  U l%        S
U l&        0 U l'        URP                  U l)        SU l*        g )Ng      пcross_attentionr6   rZ   
prime_attnr   r   r   )
dense_attn
block_attntranspose_block_attnprev_block_attnsummary_attnsummary_spread_attnr  r  r   F)+r   r   r  r   n_headsattn_dropoutr  r?   attention_multiplierhead_dimrN   r   scaler   r   c_attnc_enc_kvr  r   r  r  	attn_func
decode_qkvqkv	prime_qkvfactored_qkvr  r  r  r  r  r  r  attn	attn_maskr   r   	block_ctxr   cachenb_relevant_lyric_tokensencoder_lenrecord_attn)r   rl   rN   r  r   ATTENTION_MAPr   s         r1   r   JukeboxAttention.__init__3  s   ++~~**44t~~EF
"nn4
$]]E)
KK	))'
CDK)$..*q.IDM'
QGDK#J?JJv':':;ZZ(<(<= #))DH,&~~DH((DH  ??,<=??,<=%)%>%>@P$Q $ 4 4d;!..	:$($<$<i#H $6??G4	
 %2$<!	4>mmmm;;"!ZZ4;;6DN
!:: r3   c           
      T   U R                   nU R                  (       a  [        R                  " X-  X%-  5      nO)[        R                  " X5      nUR	                  XU-  5        UR
                  nUR                  5       nU R                  (       ar  [        U R                  UR                  S5      UR                  S5      U R                  U R                  UR                  UU R                  5      nUb  Xh-  SSU-
  -  -   n[        R                   " USS9R#                  U5      n	U R$                  (       aJ  Xl        U R(                  S:X  a4  U R&                  S S 2S S 2U R*                  S 2S U R*                  24   U l        U R-                  U	5      n	[        R                  " X5      n
U
$ )Nr   g    er   r   r  )r  r  r   rE  mul_r   re   r   r   r   r   r   r   r<   r   r#   r$   typer%  attention_probr  r$  r  )r   query_states
key_statesvalue_statesr   r  attention_weightattn_weight_typer   r,  context_statess              r1   _attnJukeboxAttention._attni  se   

==$||L,@*BTU$||LE!!%-0+11+11399 !!"%# ''	D #3#:TQX=N#N #3<AABRS"0~~-&*&9&9!Q@P@P@RTfVZVfVfTf:f&g#**>:nCr3   c                     UR                  SSSS5      R                  5       n/ UR                  5       S S QUR                  S5      UR                  S5      -  P7nUR                  " U6 $ )Nr   r6   r   rZ   r)  r   )rV  r   r   r   )r   r   new_hidden_states_shapes      r1   merge_headsJukeboxAttention.merge_heads  sq    %--aAq9DDF"oM$6$6$8"$="o}?Q?QRT?UXeXjXjkmXn?n"o!!#:;;r3   c                     / UR                  5       S S QU R                  PUR                  S5      U R                  -  P7nUR                  " U6 nU(       a  UR                  SSSS5      $ UR                  SSSS5      $ )Nr   r   r6   rZ   r   )r   r  r   rV  )r   r   is_keyr6  s       r1   split_headsJukeboxAttention.split_heads  s    #
!#2&#
LL#
 r"dll2#

 &**,CD ((Aq!44 ((Aq!44r3   c                     U R                  U5      nU R                  USS9nU R                  U5      nU R                  XX45      nU R                  U5      nU$ )NT)r:  )r;  r3  r7  )r   querykeyr   r   r2  s         r1   r  JukeboxAttention.dense_attn  s]      's40  'E>)).9r3   c                    U R                   nUR                  u  pgnU(       a#  U R                  XX45      R                  USU5      $ UR                  S   n	UR                  Xi-  U-  XX5      nX:  a6  U	nUS S 2U* S 24   R	                  5       nUS S 2U* S 24   R	                  5       nUR                  Xg-  U-  XX5      nUR                  Xg-  U-  XX5      nU R                  XX45      R                  XgU5      $ r  )r!  r\   r  r   r   )
r   r>  r?  r   r   r!  rn   rt  r   r   s
             r1   r  JukeboxAttention.block_attn  s    NN	).&
Y??5u=BB:qR[\\ ;;q>LJJz8IEy\E%&!gXY,'224a'l+668((:/9<iSCJJz3y@)WE??5u=BB:Xabbr3   c                 t   U R                   nUR                  u  pgnU(       aI  US-
  U-  n	US S 2U	S U2S S 24   nUS S 2U	S U2S S 24   nU R                  XX45      R                  USU5      $ UR                  S   n
UR                  XjU-  XX5      nUR	                  SS5      R                  5       nUR                  Xe-  X-  U5      nUR                  XgU-  XX5      nUR	                  SS5      R                  5       nUR                  Xe-  Xu-  U5      nUR                  XgU-  XX5      nUR	                  SS5      R                  5       nUR                  Xe-  Xu-  U5      nU R                  XX45      nUR                  XeX-  U5      nUR	                  SS5      R                  5       nUR                  XjU5      nU$ )Nr   r6   )r!  r\   r  r   	transposer   )r   r>  r?  r   r   r!  rn   rt  r   	block_lenr   r  s               r1   r  %JukeboxAttention.transpose_block_attn  s   NN	).&
Y 1	1Ia-I-q01C!Y1	1145E??5u=BB:qR[\\ ;;q>LJJz9+Di[EOOAq)446EJJz5|7PR[\E((:)';YRC--1%002C((:173GSCJJzi+?VEOOAq)446EJJz5w7KYWEUCJ#@Y[deJ#--a3>>@J#9MJr3   c                    U R                   nUR                  u  pgnU(       a  US-
  U-  n	U	S-
  U-  n
U	S:  a!  US S 2XU-   2S S 24   nUS S 2XU-   2S S 24   nOT[        R                  " XeXR                  UR
                  S9n[        R                  " XeXR                  UR
                  S9nU R                  XX45      R                  USU5      $ UR                  S   nUR                  Xk-  U-  XX5      nUR                  XgU-  XX5      S S 2S S2S S 2S S 24   n[        R                  R                  R                  US5      nUR                  Xg-  U-  XX5      nUR                  XgU-  XX5      S S 2S S2S S 2S S 24   n[        R                  R                  R                  US5      nUR                  Xg-  U-  XX5      nX:  a  X-  nXu-  nUnUR                  XmXX5      S S 2U* S 24   nUR                  5       R                  Xl-  XX5      nUR                  XmXX5      S S 2U* S 24   nUR                  5       R                  Xl-  XX5      nU R                  XX45      R                  XgU5      $ )Nr   r   rY   r   r   r   r   r   r   r   )r!  r\   r   r9   r<   r   r  r   r   r   r   r   )r   r>  r?  r   r   r!  rn   rt  r   r   prev_lr   nb_query_blocksnb_key_blockss                 r1   r   JukeboxAttention.prev_block_attn  sm   NN	).&
Yq[Y.Eai9,Fqy!Vy&88!;<a9*<!<a?@kk*<<_d_j_jkJ9\\afalalm??5u=BB:qR[\\ ;;q>LJJz8IEy\E((:)';YRSTVYWYVY[\^_S_`C((%%))#/ABC((:/9<iSCJJzi+?VWXZ][]Z]_`bcWcdEHH''++E3EFEJJz3y@)WE%"."; ' 4&hhz)OPQTcScSdPdenn&++J,H)_

:iSTUXgWgWhThi((*//
0Lic??5u=BB:Xabbr3   c                    U R                   nU R                  nUR                  u  pxn	U(       a  US S 2US-
  XV-  S-
  U2S S 24   n[        R                  R
                  R                  US5      nUS S 2US-
  XV-  S-
  U2S S 24   n[        R                  R
                  R                  US5      nU R                  XX45      R                  USU	5      $ UR                  XuX-  U	5      S S 2S S2SS S 24   n[        R                  R
                  R                  US5      nUR                  XuX-  U	5      S S 2S S2SS S 24   n[        R                  R
                  R                  US5      nU R                  XX45      R                  XxU	5      $ )Nr   r   r   )	r   r!  r\   r   r   r   r   r  r   )
r   r>  r?  r   r   r   r!  rn   rt  r   s
             r1   r  JukeboxAttention.summary_attn  sv   NN	).&
YaQ);a)?)KQNOC((%%))#|<C!Y]V-?!-CiOQRRSEHH''++E<@E??5u=BB:qR[\\((:w/@)LQPSQSPSUWYZ][C((%%))#|<CJJz73DiPQRTWUWTWY[]^Q^_EHH''++E<@E??5u=BB:Xabbr3   c                 v   U R                   nU R                  nUR                  u  pxn	U(       a  [        eUR	                  XuX-  U	5      S S 2S S2U* S 2S S 24   n[
        R                  R                  R                  US5      R                  5       nUR	                  XuU-  U	5      nUR	                  XuX-  U	5      S S 2S S2U* S 2S S 24   n[
        R                  R                  R                  US5      R                  5       nUR	                  XuU-  U	5      nU R                  XX45      R	                  XxU	5      $ )Nr   rH  )r   r   r\   NotImplementedErrorr   r   r   r   r   r   r  )
r   r>  r?  r   r   r   r   rn   rt  r   s
             r1   r  $JukeboxAttention.summary_spread_attn	  s#   ).&
Y%%((:w/@)LQPSQSPSV\U\U]_`M`aC((%%))#/ABMMOC((:	BCJJz73DiPQRTWUWTWZ`Y`YacdQdeEHH''++E3EFQQSEJJzF?IFE??5u=BB:Xabbr3   c                 j    U R                   nUS S 2S U24   nUS S 2S U24   nU R                  XX45      $ r   )_encoder_lenr  )r   r>  r?  r   r   r$  s         r1   r  JukeboxAttention.prime_attn  sB    ''!\k\/"a+o&u599r3   c                    UR                   S   nUb  [        S5      eUR                  SSS9u  pVnU(       a  U =R                  U-  sl        U R	                  Xg5      u  pgU R                  5       nU R                  5       U:  a  U R                  U* 5        US:  aE  U R                  S:w  a2  U R                  USS9nU R                  U5      nU R                  U5      nS	nOU R                  S
   nU R                  S   nXVXs4$ )Nr   )last_encoder_hidden_states should be NonerZ   r6   r   r  T)r>  Fr?  r   )r\   	TypeErrorrb   r   _append_cache_suff_cache_len
_cache_len_slice_cacher  _pad_to_block_ctxr"  )	r   r   last_encoder_hidden_statesr   curr_ctxr>  r?  r   l_caches	            r1   r  JukeboxAttention.factored_qkv!  s    &&q)%1GHH)//q/9EMMX%M++C7JC**,G 7*!!7(+!|>>\1 2252EE005C 2259Ejj'

7+5((r3   c                    UR                   S   nUb  [        S5      eUR                  SSS9u  pVnU(       a  U R                  5       U R                  :  a  U R                  Xg5        U R                  5       U R                  :  a  U R                  SU R                  5        U R                  S   U R                  S   pvU =R                  U-  sl        XVXs4$ )	Nr   rV  rZ   r6   r   r   r?  r   )	r\   rW  rb   rZ  rS  rX  r[  r"  r   r   r   r]  r   r^  r>  r?  r   s           r1   r  JukeboxAttention.prime_qkv8  s     &&q)%1GHH)//q/9E 4#4#44""3. 4#4#44!!!T%6%67E*DJJw,?MMX%M5((r3   c                    UR                   S   nUnU(       a  U R                  S:X  aK  U R                  UR                  U5      5      R	                  SSS9u  U R
                  S'   U R
                  S'   U R
                  S   U R
                  S   pvU =R                  U-  sl        O0U R                  UR                  U5      5      R	                  SSS9u  pgXVXs4$ )Nr   r   r6   r   r?  r   )r\   r   r  r   rb   r"  rb  s           r1   r  JukeboxAttention.decode_qkvF  s     &&q)}}!9=.66}E:%q%/ 7

5!4::g#6 E*DJJw,?MMX%M'A'I'I-'XY__`agh_iJC5((r3   c                 T   UR                   S   nU R                  U5      nU R                  XUS9u  pVpsU R                  XVXs5      nUR                   S   U:w  a/  U R	                  U5      n	US S 2XU-   2S S 24   R                  5       nU R                  U5      nU R                  U5      $ )Nr   r]  r   )r\   r  r  r  _offsetr   r  r  )
r   r   r]  r   r^  r>  r?  r   attention_scoresrE   s
             r1   r   JukeboxAttention.forwardT  s     &&q)M2$(HHY_ %- %
!E  99U?!!!$0\\(+F/6X<M3Mq0PQ\\^;;'78!!"233r3   c                 X    U R                   nXR                  -  S-   nX R                  -  $ r  )r$  r   )r   r$  encoder_blockss      r1   rS  JukeboxAttention._encoder_lena  s+    &&%49++r3   c                 \    U R                   S:X  a  gU R                  U-
  U R                  -  $ )Nr  r   )r  r   r!  )r   r^  s     r1   rh  JukeboxAttention._offsetg  s)    >>\)(DNN::r3   c                    UR                   S   nU(       a  U R                  U5      OSnX4-   U R                  -   S-
  U R                  -  nXPR                  -  U-
  U-
  nUS:X  a  US:X  a  U$ [        R                  " USSXF45      $ )Nr   r   )r\   rh  r!  r#   r   )r   r   r>  rt  rE   n_blocksr   s          r1   r\  "JukeboxAttention._pad_to_block_ctxl  s    %%a(*/g&Q$t~~59dnnL''1F:!8!  55Av(;<<r3   c                 ^    SU R                   ;  a  S$ U R                   S   R                  S   $ )Nr?  r   r   )r"  r\   r   s    r1   rZ  JukeboxAttention._cache_lenv  s,    +qKE1B1H1H1KKr3   c           	         U R                   S-
  U R                  -  S-   U R                  -   nU R                   U R                   S-
  U R                  -  S-   U R                   U R                   U R                  ::  a  U R                   OUU R                  [        U R                   U R                  5      S.nX R
                     $ )z
Precondition:
    key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
    location in the context.
r   )r  r  r  r  
cross_attnr  )r   r!  r$  r   rS  r  )r   previous_block_lengthREQUIRED_CACHE_LENs      r1   rY   JukeboxAttention._suff_cache_leny  s     "&!2dnn Dq H4>> Y--==1,>B$(MM040Ot}}Uj**dmmT->->?
 "..11r3   c                     U R                   S   S S 2X24   U R                   S'   U R                   S   S S 2X24   U R                   S'   g )Nr?  r   )r"  )r   rQ   ru   s      r1   r[  JukeboxAttention._slice_cache  sF     JJu-al;

5"jj1!UY,?

7r3   c                    SU R                   ;  a  XR                   S'   X R                   S'   OXpC[        R                  " U R                   S   U/SS9n[        R                  " U R                   S   U/SS9nU R                   S	 U R                   S	 AAXR                   S'   X R                   S'   U R                   S   U R                   S   4$ )Nr?  r   r   r   )r"  r   r8   )r   r?  r   old_key	old_values        r1   rX  JukeboxAttention._append_cache  s    

" #JJu"'JJw!$Y))TZZ.8a@CIItzz'2I>AFE

5!

7# #JJu"'JJwzz% $**W"555r3   c                     SU l         SU R                  ;   a  U R                  S	 SU R                  ;   a  U R                  S	 0 U l        g )Nr   r?  r   )r   r"  rt  s    r1   	del_cacheJukeboxAttention.del_cache  s@    DJJ

5!djj 

7#
r3   )r,  r  r  r  r   r!  r   r  r  r  r"  r  r   r$  r  r   r   rN   r  r  r%  r  r   r  r   r  r   NFr   )r   r   r   r   r   r3  r7  r;  r  r  r  r  r  r  r  r  r  r  r   propertyrS  rh  r\  rZ  rY  r[  rX  r  r   r   r   s   @r1   r	  r	  2  s    4!l D<

5c :#cJc&c$:).))4 , ,
;
=L2$@6  r3   r	  c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )JukeboxBlocki  c                 D  > [         TU ]  5         UR                  U l        [	        XUS9U l        [        UR                  5      U l        [        U5      U l	        [        UR                  5      U l
        UR                  (       a  SUR                  -  OSU l        X0l        g )Nr  r   )r   r   r  r  r	  r  r  layer_norm_0r  mlplayer_norm_1attn_res_scale
num_layersr   r  )r   rl   rN   r  r   s       r1   r   JukeboxBlock.__init__  s|    ''
$ViH	,V-?-?@f%,V-?-?@4:4I4Iv000s"r3   c                     UnU R                  U5      nU R                  XU5      nU R                  XA-   5      nU R                  U5      nU R                  S:X  a	  XA-   U-   nU$ X@R                  X-   -  -   nU$ )Nr   )r  r  r  r  r   )r   r   r]  r   r   output_statesoutputs          r1   r   JukeboxBlock.forward  s    !	))-8		-VT)))*CD/>>S .>F  =3P!QQFr3   )r  r  r  r  r  r   r  r  r   r   r   s   @r1   r  r    s    	# r3   r  c                   >   ^  \ rS rSrU 4S jrS rSS jrS rSrU =r	$ )JukeboxLayerStacki  c           
      ,  > [         TU ]  5         X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        U R                  b  X R                  -  U l        UR                  U l
        UR                  U l        [        U R                     n[        R                  " 5       U l        [!        U R
                  5       H,  nU R                  R#                  [%        XU" U5      S95        M.     / U l        g )Nr  )r   r   rN   r  r  r  r   attention_patternr!  r#  r$  r  r   r   r   
_attn_modsr>   rM   r  saved_attn_weights)r   rl   rN   r  r   r   s        r1   r   JukeboxLayerStack.__init__  s    
''
 ++mm!'!9!9;;""kk1DN!::~~ /t/E/EF--/4??+EOO""<IZ[`Ia#bc , #%r3   c                    ^ U4S jn[        U R                  5       H  u  p4U" U5      UR                  l        M     T(       d  / U l        gg)z
Makes forward prop dump self-attention softmaxes to self.saved_attn_weights.

Args:
    record_attn (`Union[bool,set]`):
        Either a set of layer indices indicating which layers to store, or a boolean value indicating Whether
        to dump all.
c                 <   > [        T[        5      (       a  T$ U T;   $ r   )r  r&   )	layer_idxr%  s    r1   _should_record_attn>JukeboxLayerStack.set_record_attn.<locals>._should_record_attn  s!    +t,,""++r3   N)	enumerater  r  r%  r  )r   r%  r  r   layers    `   r1   set_record_attn!JukeboxLayerStack.set_record_attn  sB    	,
 "$//2HA%8%;EJJ" 3 &(D# r3   c                 .   [        U R                  5       H{  u  pEUR                  S:X  a  U" XUS9nOU" US US9nUR                  R                  (       d  MB  U R
                  R                  UR                  R                  R                  5        M}     U$ )Nr  rg  )	r  r  r  r  r%  r  rM   r  r   )r   r   r]  r   r   
attn_layers         r1   r   JukeboxLayerStack.forward  s    &t7MA##'88 *!ag! !+=UYbh i***''..z/E/E/L/LM 8 r3   c                 ^    U R                    H  nUR                  R                  5         M     g r   )r  r  r  )r   r  s     r1   r  JukeboxLayerStack.del_cache  s     //JOO%%' *r3   )
r  r  r!  r   r$  rN   r  r  r  r  r  )
r   r   r   r   r   r  r   r  r   r   r   s   @r1   r  r    s    %()*( (r3   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JukeboxPositionalEmbeddingi  c                    > [         TU ]  5         [        R                  " [        R
                  " X45      5      U l        g r   )r   r   r   r   r   r   pos_emb)r   r   r  r   s      r1   r   #JukeboxPositionalEmbedding.__init__  s*    ||EKK0B$CDr3   c                     U R                   nU$ r   r  )r   r  s     r1   r   "JukeboxPositionalEmbedding.forward  s    ,,r3   r  r   r   s   @r1   r  r    s    E r3   r  c                      ^  \ rS rSr     S	U 4S jjr      S
S jrS r        SS jrS r         SS jr	Sr
U =r$ ) JukeboxConditionalAutoregressivei  c                 r  > [         TU ]  5         UR                  U l        UR                  U l        Ub  UOUR
                  U l        Ub  UOUR                  U l        [        R                  " U R                  UR                  5      U l
        [        R                  " UR                  5      U l        XPl        X@l        U(       d;  [        R                   " ["        R$                  " SUR                  45      5      U l        [)        U R
                  UR                  5      U l        [        R                  " UR                  5      U l        [/        XR
                  S9U l        X`l        UR4                  U l        UR8                  (       a  SU l        SU l        OSU l        SU l        U(       d  [        R>                  " UR                  U R                  SS9U l         U R<                  (       a%  U R                  RB                  U R@                  l!        ["        R                  RE                  5       U l#        gg)a  
Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
set for each configuration.

Args:
    config (`JukeboxPriorConfig`):
        Model configuration class with all the parameters of the model. Initializing with a config file does
        not load the weights associated with the model, only the configuration. Check out the
        [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    n_ctx (`int`, *optional*):
        Number of tokens or lyrics tokens provided in a single pass.
    embed_dim (`int`, *optional*):
        Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codebook dimension,
        if the model combines lyrics and music tokens, or simply n_vocab if the model is a separate encoder
    audio_conditioning (`bool`, *optional*, defaults to `False`):
        Whether or not the prior supports conditioning on audio.
    metadata_conditioning (`bool`, *optional*, defaults to `False`):
        Whether or not the prior supports conditioning on artitst, genres, lyrics and timing.
    is_encoder (`bool`, *optional*, defaults to `False`):
        Whether the model is an encoder only model.
Nr   )rN   FTr   )$r   r   r  r  r  rN   music_vocab_sizer   r   r  embed_tokensr  emb_dropoutembed_tokens_dropoutmetadata_conditioningaudio_conditioningr   r   r   start_tokenr  r  pos_emb_dropoutr  transformer
is_encoderr#  r$  merged_decoderadd_cond_after_transformershare_embed_tokens_fc_proj_outr  fc_proj_outr   CrossEntropyLossr  )r   rl   rN   r   r  r  r  r   s          r1   r   )JukeboxConditionalAutoregressive.__init__	  s   > 	''
 ++#/UV\\
&/&;AXAXLL9K9KL$&JJv/A/A$B!%:""4$!||EKKF<N<N8O,PQD1$**f>P>PQ!zz&*<*<=,V::F$!::  .3D+27D/.2D+26D/!yy););T^^RWXD22*.*;*;*B*B  '113DI	 r3   c                    UR                   S   n[        R                  " 5          UR                  US5      R	                  5       nSSS5        U R
                  (       dl  [        R                  " USU R                  4UR                  U R                  R                  S   R                  R                  R                  R                  S9nUn	U R                  U5      n
[        R                   " U
SS2SS24   U
SS2SS24   4SS9n
U R"                  (       a#  UR                  XR                  5      U
SS2S4'   OU R$                  U
SS2S4'   U R'                  U
5      U R)                  U R+                  5       5      -   U-   n
U R                  XS9n
U R,                  (       a  X-   n
U
nU R.                  (       a  U
$ U R1                  U
5      n
[2        R4                  " 5       nU(       a  U
SS2SU R6                  24   R9                  SU R:                  5      nU
SS2U R6                  S24   R9                  SU R:                  5      nU" XSS2SU R6                  24   R9                  S5      5      [<        R>                  " S5      -  nU" XSS2U R6                  S24   R9                  S5      5      [<        R>                  " S5      -  nUU4nOIU" U
R                  SU R:                  5      U	R                  S5      5      [<        R>                  " S5      -  nU(       a  UU
4$ U(       a  UU4$ US4$ ! , (       d  f       GN= f)	z
Args:
    tokens (`torch.tensor`):
        Can represent music tokens, lyrics tokens or both, depending on the configuration.
r   r   Nr   rY   r   )r]  r5   ) r\   r   rD  r   r:   r  r9   r  r<   r  r  r  r  r   r   r  r8   r  r  r  r  r  r  r  r  r   r  r$  reshaper   rg   rG  )r   rG   r  r  r]  	get_predsget_actsget_sep_lossrn   targetr   activationsloss_fnlyric_hidden_statestoken_hidden_states
lyric_lossmusic_token_lossr  s                     r1   r   (JukeboxConditionalAutoregressive.forwardH  s     \\!_
]]_[[R0557F  &&!&Q

+}}&&11!488==DDJJ" ))&1		=BC#8-3B3:O"PVWX%%"7"<"<Z"TM!Q$"&"2"2M!Q$ %%m4t7K7KDLLN7[[^pp 	 (( ) 
 **)>M#??  ((7%%'"/3ET5E5E3E0E"F"N"NrSWSaSa"b"/43C3C3E0E"F"N"NrSWSaSa"b !4Q@R$BRBR@R=R6S6[6[\^6_`cecicijmcnnJ&':1dFVFVFXCX<Y<a<abd<efikioiopsitt 01D=--b$..A6;;r?SVXV\V\]`VaaD&&$$:g _s   !L00
L?c                 D   US:X  a  [         R                  " USU R                  U R                  R                  R
                  S9R                  U R                  R                  R                  5      nU R                  (       a#  UR                  X R                  5      US S 2S4'   O%U R                  US S 2S4'   OU R                  U5      nUR                  X R                  U R                  4:X  a  US S 2XS-   2S S 24   nOUnX`R                  5       XS-    -   U-   nXg4$ )Nr   r   r   )r   r   r  r  r   r   r;   r<   r  r   r  r\   rN   r  )r   r   r  rG   r  r  r   conds           r1   get_emb(JukeboxConditionalAutoregressive.get_emb  s    q=!KK	1djjHYHYH`H`HfHfgjj!!((//M ))&;&@&@JJ&Wad#&*&6&6ad# --f5M##	::tzz'JJ%aqL)@!&CDD%D%xQ,(OORVV""r3   c
           	      l   U	c  U R                   n	U R                  (       d  [        R                  " USU R                  4U R
                  R                  S   R                  R                  R                  R                  S9R                  U R                  R                  5      n[        R                  " 5          / n
S nU(       a  / n[        [!        SU	5      SS9nU H  nUR#                  SU	 S3SS	9  U R%                  XXU5      u  nnU R                  XSS
9nU R&                  (       a  UU-   nU R                  U5      nU(       a  WR)                  UR+                  5       5        X-  n[-        XUS9n[        R.                  R1                  US9R3                  5       nU
R)                  UR+                  5       5        M     AU R
                  R5                  5         [        R6                  " U
SS9nU(       a  [        R6                  " WSS9nS S S 5        U(       a  WW4$ W$ ! , (       d  f       N= f)Nr   r   r   FleavezAncestral sampling  music tokensTrefreshrg  r)   r*   r(   r   )rN   r  r   r9   r  r  r  r  r  r   r   r;   r  r<   rD  r   r>   set_descriptionr  r  rM   r   r2   distributionsCategoricalr   r  r8   )r   r  r  r  r]  tempr)   r*   r  sample_tokensr  rG   predsiterr   r   r  s                    r1   r   'JukeboxConditionalAutoregressive.sample  s      JJM&&!&Atzz*$2B2B2M2Ma2P2T2T2Y2Y2`2`2f2f"b!!(()  ]]_NFa/u=D $$':=/%Wae$f&*llEZ'#t !% 0 0!ae !1 ! 22$1D$8M $ 0 0 ?LL!4!4!67 - 4 -mPU V,,888NUUW%%flln5' !* &&(YY~15F		%Q/C D 5= MK _s   :EH%%
H3c                 F    X-   S-
  U-  n/ U/US-
  -  QUS-
  U-  S-   PnU$ r  r}  )r   length
chunk_sizen_passeschunk_sizess        r1   split_chunks-JukeboxConditionalAutoregressive.split_chunks  sC    '!+
:U15U
j7PST7TUr3   c                    Uc  U R                   nUR                  S   n[        R                  " 5          UR	                  US5      R                  5       nS S S 5        [        R                  " USSS9n[        U5      nU R                  (       dz  [        R                  " USU R                  4U R                  R                  S   R                  R                  R                  R                   S9R#                  UR$                  5      n[        R                  " 5          U	(       a  / nU
c  ['        U5      n
U R)                  ['        U5      U
5      n/ nSnS n[+        USSS9 H  n/ / nn[-        UUU-   5       HA  nU R/                  UUUX45      u  nnUU   nUR1                  U5        UR1                  U5        MC     UU-   n[        R2                  " USS9[        R2                  " USS9nnAAU	(       d  AU R                  UUS	S
9nU	(       a*  U R4                  (       a  UW-   nAUR1                  U5        M  AM     U	(       a7  [        R2                  " USS9nU R7                  U5      nWR1                  U5        US   n[+        [-        ['        U5      U5      S['        [-        ['        U5      U5      5       S3SS9nU H  nU R/                  UUUX45      u  nnU R                  UUS	S
9nU R4                  (       a  UU-   nU R7                  U5      nU	(       a  WR1                  U5        UU-  n[9        UXxS9n[        R:                  R=                  US9R?                  5       nUR1                  URA                  5       5        UnM     AAU R                  RC                  5         [        R2                  " USS9nU	(       a  [        R2                  " WSS9nS S S 5        U	(       a  WW4$ W$ ! , (       d  f       GN= f! , (       d  f       N-= f)Nr   r   r   r   r   zPreparing past key valueF)rU   r  Trg  	Sampling r  r  r  )"rN   r\   r   rD  r   r:   splitr=   r  r9   r  r  r  r  r  r   r   r;   r<   r7   r  r   r>   r  rM   r8   r  r  r2   r  r  r   r   r  )r   r  lyric_and_music_tokensr  r  r]  r  r)   r*   r  r  r  rn   sampled_audior  r  x_primesrQ   r  current_chunk_sizesampled_audio_primeconds_primer   x_prime
cond_primeinput_tokens
itereratorr   r  ri   s                                 r1   primed_sample.JukeboxConditionalAutoregressive.primed_sample  s      JJM+11!4
]]_%;%@%@R%P%U%U%W"  $:A1E]+&&!&Atzz*$2B2B2M2Ma2P2T2T2Y2Y2`2`2f2f"b'../  ]]_ ! /
++C,>
KKHEE&*;=W_d&e"35r[# %eU5G-G HH*.,, )U4F+'GZ *(3E'..w7&&z2 !I  22&+ii0C&KUYYWbhiMj' "**7Oirv*w66")J"6"OOG,/ 'f2 ))H!4**73W% ),Lc-(-8 U3}+=}%M!N O}]J
 '&*lli7I'#t !% 0 0!>Xae !1 ! 22$1D$8M $ 0 0 ?LL/ - 4 -m5 V$22>>m>T[[]$$\%7%7%9:+' '* l&&( 99]:L		%Q/[ \ &&y _ _s   !O+J8O=+
O:=
P)r  r  r   r  r  r$  r  r  r  r  rN   r  r  r  r  r  r  r  )NNFFF)NNNFFF)NNNr   r   r   FN)	NNNr   r   r   FNN)r   r   r   r   r   r   r  r   r  r  r   r   r   s   @r1   r  r    s      #=4D  "#'DL#,  "#'9v  "#'n  n r3   r  c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )JukeboxMusicTokenConditioneriR  z
The `JukeboxMusicTokenConditioner` takes music tokens as an input (corresponding to the codes of the VQVAE's
codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQVAE).
c           
      p  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        UR                  Ul        [        UUR
                  UR                  UR                  UR                  U   UR                  U   SS9U l        [        UR
                  5      U l        g )NF)r   )r   r   r   r  r  r  r  r   r  r  r  r  r  	upsamplerr  r  )r   rl   rm   r   s      r1   r   %JukeboxMusicTokenConditioner.__init__X  s    LL)@)@&BTBTU!22/!!!!u%  '"
 +6+=+=>r3   c                     Uc  SnUR                  5       nU R                  U5      nX2-   nUR                  SSS5      nU R                  U5      nUR                  SSS5      nU R	                  U5      nU$ )a  
Args:
    music_tokens (`torch.LongTensor`):
        Music tokens form the upper level in range(nb_discrete_codes)
    raw_audio_conditioning (`torch.LongTensor`, *optional*):
        Audio used when primed sampling, raw audio information that conditions the generation
r   r   r6   r   )r:   r  rV  r   r  )r   ri   raw_audio_conditioningr   s       r1   r   $JukeboxMusicTokenConditioner.forwardh  s     ")%("#((*)),7%> &--aA6}5%--aA66r3   )r  r  r   r   	r   r   r   r   __doc__r   r   r   r   r   s   @r1   r  r  R  s    
?  r3   r  c                   :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )JukeboxRangeEmbeddingi  a{  
The `JukeboxRangeEmbedding` interpolate the given [pos_start, pos_end] to obtain an equivalent of time positional
embedding of length `n_ctx`.

Binning process : For each pos in position tensor, find its bin [start,end) mapped to [0,1,...,bins-1] [start,end)
-> [0,1) -> [0, bins) -> floor -> [0,...,bins-1] NOTE: Open ended interval on right, so start <= pos < end, not <=
end
c                    > [         TU ]  5         Xl        X l        [        R
                  " X$5      U l        Uu  U l        U l        XPl	        g r   )
r   r   n_timer   r   r  embpos_minpos_maxr   )r   r
  r   r>   	out_widthr   r   s         r1   r   JukeboxRangeEmbedding.__init__  s;    "<<	5%*"dl
r3   c                    [        UR                  5      S:X  d  [        SUR                   35      eU R                  U:*  R	                  5       (       dI  XR
                  :  R	                  5       (       a(  [        SU R                   SU R
                   SU 35      eUR                  5       nUbG  U R                  (       a&  UR                  U R                  U R
                  5      nUR                  5       nU R                  nUS:w  aM  [        R                  " SU[        R                  UR                  S9R                  SU5      U-  nXU-
  U-  -   nOUnXPR                  -
  U R
                  U R                  -
  -  nU R                  U-  R                  5       R                  5       R!                  5       nU R#                  U5      $ )	Nr6   z Expected shape with 2 dims, got z
Range is [,z), got r   r   rT   )r7   r\   rW  r  allr  re   r   r
  r   aranger<   r   r   floorr:   r~  r  )r   	pos_startpos_endr
  interpolationpositionnormalised_positionbins_s           r1   r   JukeboxRangeEmbedding.forward  sr   9??#q(>y>OPQQ	)..00i,,6N5S5S5U5Uja~WYKXYYOO%	zz!--dllCmmoGQ;Qekk)BRBRSXXYZ\bcfll  !i$7=#HHH H  (,,64<<$,,;VW"55<<>CCELLNxxr3   )r   r  r   r
  r  r  r   r   r  r   s   @r1   r  r    s     r3   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JukeboxLabelConditioneri  c                 $  > [         TU ]  5         UR                  nUR                  nUR                  nUR
                  u  pgUR                  nUR                  U l        [        R                  " Xc5      U l
        [        R                  " Xs5      U l        X l        U R                  (       ae  UR                  U-  UR                  U-  4n	SUR                  U-  4n
Sn[        SXIU5      U l        [        XX5      U l        [        XXSS9U l        g g )Nr   )r   r   r   T)r   )r   r   r  timing_dimsr  metadata_dimsrN   max_nb_genresr   r  bow_genre_emb
artist_embinclude_time_signalmin_durationmax_durationr  total_length_embabsolute_pos_embrelative_pos_emb)r   rl   r$  r   r  r  	nb_genres
nb_artistsr  total_length_rangeabsolute_pos_rangerelative_pos_ranger   s               r1   r    JukeboxLabelConditioner.__init__  s   &&	((,, & 4 4	#\\#11\\)?,,z=#6 ##"("5"5"EvGZGZ]jGj!k"%v':':]'J!K!+$9![^g$hD!$9"1C%D! %:"1CVZ%D! $r3   c                 P   US S 2SS24   nUS S 2SS24   nUS S 2SS24   nUS S 2SS24   nUS S 2SS 24   nU R                  U5      nUS:  R                  5       R                  S5      nU R                  UR	                  S5      5      U-  R                  SSS9n	X-   n
U R                  (       ar  X3U-   pUR                  5       nUR                  5       nUR                  5       nU R                  U5      U R                  X5      -   U R                  X-  X-  5      -   nX4$ S nX4$ )Nr   r   r6   rZ   r   Trd  )
r#  re   rA   r"  r   rF  r$  r'  r(  r)  )r   rv   rD   rE   r  artistgenrer#  r   	genre_emb	start_embrQ   ru   r  s                 r1   r   JukeboxLabelConditioner.forward  sQ   1Q3'!QqS&!!QqS&!!QqS&!AB __V,

!!#--a0''A7$>CCSWCX	*	 ##&3'--/LKKME))+C%%l3''34''(<c>PQR  !! G!!r3   )r(  r#  r"  r$  r!  r)  r'  r   r   s   @r1   r  r    s    2" "r3   r  c                   H  ^  \ rS rSr% Sr\\S'   S rSS\4U 4S jjjrSS jr	S	 r
S
 rS rS rS rSS jrSS jrS r        SS jrSS jrS r/ SSS4S jr  S S\R.                  S\\\R4                        S\\   S\\   S\\R.                     4
S jjrSrU =r$ )!JukeboxPriori  u7  
The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be
seen as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù
is defined, it also models the `next character` prediction on the lyrics. Can be conditioned on timing, artist,
genre, lyrics and codes from lower-levels Priors.

Args:
    config (`JukeboxPriorConfig`):
        Model configuration class with all the parameters of the model. Initializing with a config file does not
        load the weights associated with the model, only the configuration. Check out the
        [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    level (`int`, *optional*):
        Current level of the Prior. Should be in range `[0,nb_priors]`.
    nb_priors (`int`, *optional*, defaults to 3):
        Total number of priors.
    vqvae_encoder (`Callable`, *optional*):
        Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions instead of
        the vqvae module to avoid getting the parameters.
    vqvae_decoder (`Callable`, *optional*):
        Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions instead of
        the vqvae module to avoid getting the parameters.
rl   c                    U R                   R                  n[        U[        R                  5      (       a)  UR
                  R                  R                  SSU-  S9  GO7[        U[        5      (       aj  U R                   R                  (       a&  UR
                  R                  R                  5         GOUR
                  R                  R                  SSU-  S9  GO[        U[        5      (       a)  UR                  R                  R                  SSU-  S9  GOz[        U[        5      (       a3  UR                  R
                  R                  R                  SSU-  S9  GO2[        U[        5      (       aC  [!        US5      (       a2  UR"                  R
                  R                  R                  SSU-  S9  O[        U[        5      (       a9  [!        US5      (       a(  UR$                  R                  R                  SSU-  S9  O[        U[&        5      (       aw  U R                   R                  (       a\  UR(                  R
                  R                  R                  5         UR(                  R*                  R                  R                  5         [        U[        R,                  5      (       aI  UR*                  R                  R                  5         UR
                  R                  R/                  S5        [        U[        R0                  5      (       a3  UR*                  b%  UR*                  R                  R                  5         g g g )Nr   r  r  r2  lm_headr  r   )rl   r  r  r   r  r   r  r  r   r  r  r  r  r  r  r  hasattrr9  r  r   r   r   r   r  r  )r   r  r  s      r1   r  JukeboxPrior._init_weights  sH   [[++
fbll++MM&&CTJ5F&G..{{##""((*""**
9J*K :;;NN''SdZ6G'H 566JJ""**
9J*K @AAgfV_F`F`NN!!&&..CTJ=N.O @AAgfVcFdFd##++$:K+L 5664;;;O;OOO""''--/OO  %%++-fbll++KK""$MM$$S)fbii((V[[-DKK""$ .E(r3   Nc           
      	  > [         T	U ]  U5        X@l        XPl        X0l        Ub  UOUR
                  U l        SU R
                   3U l        UR                  U l        UR                  S:  U l	        UR                  U l        UR                  U l
        U R
                  S:g  U l        U R
                  S-
  U l        U R                  (       a  [        XR
                  5      U l        UR                  U l        U R                  (       a  [!        XR                  (       + S9U l        UR$                  U l        UR$                  (       a  UR                  UR                  /U l        SUR(                  /U l        UR,                  U l        UR                  U l        [1        UUR                  UR                  -   UR(                  UR2                  -   U R                  =(       d    U R                  SS9U l        GORUR6                  nU R                  S:w  a  U R                  (       a  UR,                  U l        UR,                  U l        UR(                  U l        [1        UU R                  U R<                  SSSS9U l        [A        UR,                  UR,                  5      U R>                  l!        [E        UR,                  5      U R>                  l#        [H        RJ                  " UR,                  UR(                  SS	9U R>                  l&        OSU l        [1        UU R                  =(       d    U R                  U R                  S
9U l        UR                  U l'        U R                  U RN                  -   U l(        [S        URT                  URV                  5       VVs/ s H	  u  pxXx-  PM     snnU l,        U R
                  S:w  a  U RX                  U R
                     OS U l-        [\        R^                  " U RX                  S X0R
                  -
   5      U l0        U R                  U R`                  -  U l1        [d        Rg                  SU R
                   SU RZ                   SU R`                   SU Rb                   35        g s  snnf )Nzpriors.r   r   )r$  T)rN   r   r  r  F)rN   r   r  r  r  r  )r  r  zLevel:z, Cond downsample:z, Raw to tokens:z, Sample length:)4r   r   vqvae_encodervqvae_decoderr[   rm   r  rN   r#  lyric_conditioningencoder_loss_fractionr  
cond_levelr  conditioner_blocksr  r  metadata_embeddingis_encoder_decoderinput_shapeslyric_vocab_sizeembed_dim_shiftr  r  r  r  rk   encoder_configlyric_acts_widthencoder_widthencoder_dimr  r   r  r  final_layer_normr   r  r9  next_token_prediction_loss_dimstotal_loss_dimsrc   r  r  r  cond_downsamplerg   rJ  raw_to_tokensra   loggerinfo)
r   rl   rm   	nb_priorsr=  r>  rH  r  r  r   s
            r1   r   JukeboxPrior.__init__  s     +*#/UV\\
#*4::,!7\\
"("A"AA"E(.(G(G%%+%A%A" #'**/**q.""&B6::&VD# &,%A%A"%%&=f^u^uZu&vD# #)";";$$!'!@!@&,, OD$%v'>'>#?D ++DJ,2,K,KD)955D 11F4K4KK$($;$;$Yt?Y?Y&*DJ $22N,,1d6M6M(6(B(B%%+%7%7"#)#:#: ?"77"..',*/#  (5^5O5OQWQcQc'd$0@ASAS0T-')yy1C1CVE\E\ch'i$01- :$($;$;$Yt?Y?Y&*&@&@DJ 06||,#<<t?c?cc=@AUAUW]WiWi=jk=j\VFL=jk?CzzQt//

;TXWWT%5%56N	JJ8N%OP!ZZ$*<*<<TZZL 243G3G2HHXY]YkYkXl m))*,	
 ls   RFc                    UR                  5       nX6S S 2S4'   [        U R                  5      US S 2S4'   [        X@R                  -  5      [        X R                  -  5      -   US S 2SS24'   U R	                  U5      u  pgU(       a  Xg4$ U$ rx  )r   r?   ra   rP  set_metadata_lyric_tokens)r   rj   rQ   rD   rE   rV   rv   rH   s           r1   r`   JukeboxPrior.get_metadatau  s    <<>%AT//0A v(:(::;c%J\J\B\>]]AaC !::8D$$Or3   c                 T   U R                   S:  Ga  [        R                  " UR                  S   U R                   4[        R                  UR
                  S9n/ n[        UR                  S   5       Hv  nUR                  5       SS2SU R                  R                  -   S24   nXS4   XS4   XS4   pn[        XPR                   XgU5      u  pXUSS24'   UR                  U
5        Mx     [        R                  " USS2SSU R                  R                  -   24   U4SS9U4$ US4$ )	zq
Processes the full labels to only retrieve the relevant lyric tokens and keep the metadata conditioning tokens.
r   rT   Nr   r   r6   r   r   )r#  r   r9   r\   r:   r<   r>   r   rC  r!  rJ   rM   r8   )r   rj   tokens_listindices_listidxrB   rD   rE   rF   rG   rH   s              r1   rV  &JukeboxPrior.set_metadata_lyric_tokens  s4    ((1,++a$"?"?@

[a[h[hK LV\\!_-$llnQD4K4K4Y4Y0Y0[-[\17QQQW]^X^Q_h";!>!>V^# '-CF###G, . 		6!%Pq4+B+B+P+P'P%P"PQS^_egh 
 4<r3   c                    U R                   S:w  a  XR                   S-
     nUSS2X R                  -  X0R                  -  24   nU R                  U R                  -  US   R                  S   -
  nUS:  aT  [        R
                  " SU5      R                  UR                  5      n[        R                  " XF4SS9R                  5       nU/nU$ SnU$ )z5
Extracts current level's conditioning music tokens.
r   r   Nr   r   )
rm   rO  rN   r\   r   r9   r;   r<   r8   r:   )r   ri   rQ   ru   music_tokens_condmissing_cond_len	init_condmusic_tokens_condss           r1   get_music_tokens_conds#JukeboxPrior.get_music_tokens_conds  s     ::? ,ZZ!^ <,Q9M9M0MPSWkWkPk0k-klL#zzT-A-AADUVXDYD_D_`bDcc!#!KK+;<??@Q@X@XY	$)II/@.LRT$U$Z$Z$\!"3!4 "! "&!!r3   c                    US   R                   S   n[        [        U5      5       H2  nX   [        U R                  U   5      -   R                  US5      X'   M4     [        [        U5      5       HU  nX$   b  M
  [        R                  " X0R                  U   U R                  4US   R                  US   R                  S9X$'   MW     [        R                  " USS9[        R                  " USS94$ )z
Shifts the input tokens to account for the dictionary merge. The embed_dim_shift give by how much the music
tokens should be shifted by. It is equal to `lyric_vocab_size`.
r   r   rT   r   r   )r\   r>   r7   r?   rG  r   r   r9   rE  r  r   r<   r8   )r   rG   condsrn   r   s        r1   prior_preprocessJukeboxPrior.prior_preprocess  s    
 AY__Q'
s6{#AS)=)=a)@%AAGG
TVWFI $ s5z"Ax ;;!2!21!5tzzB&QR)//bhijbkbrbr # yyQ'5a)@@@r3   c                    UR                   S   nU R                  S   UR                   S   U R                  S   -
  4n[        [        R                  " XSS95      n[        [        U5      5       HM  n[        U R                  U   5      nX   U-
  R                  US5      X'   [        R                  " X   SS9X'   MO     US   $ )z
Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is
shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music
tokens.
r   r   r   r   )r   )r\   rE  r=   r   r  r>   r7   r?   rG  r   r   )r   rG   rn   dimsr   
bins_shifts         r1   prior_postprocessJukeboxPrior.prior_postprocess  s     \\!_
!!!$fll1o8I8I!8L&LMekk&A67 s6{#AT11!45JZ/55j"EFIFI15FI $
 bzr3   c                     USU R                   S-    nSn[        [        [        XR                  /5      5      5       H  u  p4U" X25      nM     U$ )zZ
Embeds the upper level music tokens and upsamples them to provide as audio conditioning.
Nr   )rA  rh   r=   rc   rB  )r   ra  r  r^  conditioner_blocks        r1   r  JukeboxPrior.embed_tokens  s[     00E$//A2EF!4<T#FX[r[rZsBt=u4v0!23D!Y 5w!!r3   c                     Uc  U R                   nUc  U R                  n[        R                  " 5          U R	                  XX4S9nSSS5        U$ ! , (       d  f       W$ = f)zY
Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states.
Nr  r  r  )rm   r[   r   rD  r=  )r   r   r  r  r  rK  s         r1   ru  JukeboxPrior.encode  sb     **KI]]_ ..) / M  	 _    A
Ac                     Uc  U R                   nUc  U R                  n[        R                  " 5          U R	                  XX4S9nSSS5        U$ ! , (       d  f       W$ = f)z;
Usamples the sequence of codebook vectors to a raw audio.
Nrq  )rm   r[   r   rD  r>  )r   ri   r  r  r  r  s         r1   ry  JukeboxPrior.decode  s`     **KI]]_'' ( F  	 _ rs  c                    Ub2  UR                   S   U R                  -
  nUSS2SU24   USS2US24   pBOSu  p$U R                  (       a  U R                  U5      OSu  pVU R                  (       a  U R                  U5      OUnXuU4$ )z{
Converts the input tokens to input_embeddings. Splits the lyrics form the rest of the metadata. Lyric tokens
can be None.
Nr   )NN)r\   r#  r  rC  r  r  )r   ra  rv   n_labelslyric_tokensr  metadata_posr  s           r1   get_condJukeboxPrior.get_cond  s    
 ~~a(4+H+HHH%-a(l%;Xal=Sl%/"H151K1KD##H-Q] 	, GKF]F]T../ABco!,FFr3   c
                    USL =(       d    UR                   S   S:H  n
SSS.U
   n[        R                  U SU SU S	U S
U 3	5        [        R                  " 5          U R                  X45      u  pnU R                  (       az  U
(       a  U R                  U/SU/5      u  pOU R                  X/SU/5      u  pU	b  XR                  -  n	U R                  R                  UUUUUUUUU	S9	nU R                  U5      nOZU R                  USS9nU
(       a!  U R                  R                  UUUUUUUU	S9nO"U R                  R                  UUUUUUUUUU	S9
nSSS5        U$ ! , (       d  f       U$ = f)ah  
Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas.

Args:
    n_samples (`int`):
        Number of samples to generate.
    music_tokens (`list[torch.LongTensor]`, *optional*):
        Previously generated tokens at the current level. Used as context for the generation.
    music_tokens_conds (`list[torch.FloatTensor]`, *optional*):
        Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
        conditioned on the upper-level tokens.
    metadata (`list[torch.LongTensor]`, *optional*):
        List containing the metadata tensor with the artist, genre and the lyric tokens.
    temp (`float`, *optional*, defaults to 1.0):
        Sampling temperature.
    top_k (`int`, *optional*, defaults to 0):
        Top k probabilities used for filtering.
    top_p (`float`, *optional*, defaults to 0.0):
        Top p probabilities used for filtering.
    chunk_size (`int`, *optional*):
        Size of the chunks used to prepare the cache of the transformer.
    sample_tokens (`int`, *optional*):
        Number of tokens to sample.

Nr   r   	AncestralPrimed)TFz
 sampling z samples with temp=z, top_k=z, top_p=)r  r)   r*   r  r  T)r   )r  r)   r*   r  )r\   rQ  rR  r   rD  rz  rD  rf  r#  rk   r  rk  get_encoder_statesr   )r   r  ri   ra  rv   r  r)   r*   r  r  no_past_contextnamer  r  rx  r  r]  s                    r1   r   JukeboxPrior.sample	  s   J '$.L,2D2DQ2G12L!(3ODtfJyk1DTF(SXRYYabgahij]]_FJmmTfFqC|&&"AEAVAV%/A(BB>*,> BFAVAV%4t=O6PB>* !,!%B%BBM#zz77*&))"/  8 
   $55lC-1-D-D\Z^-D-_*"#'::#4#4!*-2!##&3 $5 	$L $(::#;#;!$*-2!###-&3 $< $LQ h i _h s   C:E""
E1c                 F   U R                   S:w  a  U R                  (       a}  U(       a*  U R                  R                  UR                  5      U l        U R                  USSS5      nU R                  R                  U5      nU R                  R                  U5      nU$ SnU$ )z
Retrieve the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
the lyric encoder.
r   N)r#  r?  r  r;   r<   r  rL  )r   rx  r   
lyric_actsr]  s        r1   r  JukeboxPrior.get_encoder_statesh  s    
 ((A-$2I2I#|||/B/BClD$EJ--j9J)-)F)Fz)R& *) *.&))r3   c                 `   U R                   (       a}  U R                  R                  U5      n[        R                  R                  UR                  SU R                  5      UR                  S5      5      [        R                  " S5      -  nU$ [        R                  " SUR                  S9nU$ )zG
Computes the loss for the lyric encoder: next lyric token prediction.
r   r5   r   r   )r?  r  r9  r   r   cross_entropyr   rK  rg   rG  r   tensorr<   )r   r]  target_lyricsencoder_losss       r1   get_encoder_lossJukeboxPrior.get_encoder_lossw  s     "")-)=)=>X)Y&==66*//D4D4DE}GYGYZ\G]sL
  !<<4N4U4UVLr3   c                    U(       a%  U R                   R                  R                  U5        U R                  X#5      u  pgnU R                  (       a/  U R                  X/SU/5      u  pU R                  XUSUS9u  u  pnO7U R                  U5      nU R                  X5      n
U R                  UUUUUS9u  pU R                  U
-  U R                  -  U R                  -  nXU R                  -  U R                  -  -  nUR                  5       R                  5       U
R                  5       R                  5       UR                  5       R                  5       S.nU(       a!  UR                  5       R                  5       US'   U(       aG  U R                   R                  R                  nU R                   R                  R                  S5        U$ X4$ )z
Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the
vqvae's encoding layers.
NT)r  r  )r  )bpdr  next_token_prediction_lossr  F)rk   r  r  rz  rD  rf  r  r  r@  r#  rN  rM  r~  r   r  )r   ri   ra  rv   r  rW   r  r  rx  rG   r  r  r  r]  r  r  r  s                    r1   rd   JukeboxPrior.forward_tokens  s    JJ""223CDBF--PbBm?<"")-)>)>,t5G.H*&F AE

,APT`i AK A=6\ *.)@)@)N&001KZL04

"%*# 1; 1-& ))L84;X;XX[_[o[ooT-Q-QQTXThThhh .446<<>(//1779*D*K*K*M*S*S*U

 $||~335GG!%!7!7!J!JJJ""2259%%= r3   r   rv   ry  r  r  c                     UR                   S   nU R                  XS9tpgU R                  UUUUS9u  pU(       a  U R                  U/UQ5      n
OSn
XU	4$ )a  
Encode the hidden states using the `vqvae` encoder, and then predicts the next token in the `forward_tokens`
function. The loss is the sum of the `encoder` loss and the `decoder` loss.

Args:
    hidden_states (`torch.Tensor`):
        Hidden states which should be raw audio
    metadata (`list[torch.LongTensor]`, *optional*):
        List containing the metadata conditioning tensor with the lyric and the metadata tokens.
    decode (`bool`, *optional*, defaults to `False`):
        Whether or not to decode the encoded to tokens.
    get_preds (`bool`, *optional*, defaults to `False`):
        Whether or not to return the actual predictions of the model.
r   )r  )ri   ra  rv   r  N)r\   ru  rd   ry  )r   r   rv   ry  r  rn   ri   ra  r  r  r^  s              r1   r   JukeboxPrior.forward  sx    * #((+
,0KKK,\)++%1	 , 
 !%l-P=O-P!Q!%!00r3   )r  r  rO  rA  rB  r  rG  r  rK  r@  rJ  rE  rD  rm   r[   rI  r?  r  rC  rN   r#  rM  rk   rP  ra   rN  r>  r=  r  )NrZ   NNr   )NNr   )NNNr   r   r   NN)FF)r   r   r   r   r  r   r  r  r   r`   rV  rb  rf  rk  r  ru  ry  rz  r   r  r  rd   r   r  r   r=   
LongTensorr&   r   r   r   r   s   @r1   r7  r7    s    . %6W
1 W
 W
r" 2" A"$"G& ]~* 02DEdi+!b "'$)!1||!1 4 0 012!1 	!1
 D>!1 
ell	!1 !1r3   r7  c                   F   ^  \ rS rSr% Sr\\S'   SrSrS r	U 4S jr
SrU =r$ )	JukeboxPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
rl   jukeboxFc                 r    [        U[        [        45      (       a  UR                  UR                  5        g g r   )r  r7  r  applyr  r  s     r1   r  $JukeboxPreTrainedModel._init_weights  s+    f|\:;;LL--. <r3   c                 &   > [         TU ]  " U0 UD6  g r   )r   r   )r   inputskwargsr   s      r1   r   JukeboxPreTrainedModel.__init__  s    &+F+r3   r}  )r   r   r   r   r  r   r  r  supports_gradient_checkpointingr  r   r   r   r   s   @r1   r  r    s+    
 !&+#/, ,r3   r  a"  
            labels (`list[torch.LongTensor]` of length `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_length)` :
                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
                condition the generation.
            sampling_kwargs (`dict[Any]`):
                Various additional sampling arguments that are used by the `_sample` function. A detail list of the
                arguments can bee seen in the [`_sample`] function documentation.
ao  The bare JUKEBOX Model used for music generation. 4 sampling techniques are supported : `primed_sample`, `upsample`,
    `continue_sample` and `ancestral_sample`. It does not have a `forward` method as the training is not end to end. If
    you want to fine-tune the model, it is recommended to use the `JukeboxPrior` class and train each prior
    individually.
    c                     ^  \ rS rSrS/rU 4S jrS rSS jrSS jrS r	S r
S	 rS
 r\R                  " 5                  SS\\R                      4S jj5       r\" S5      SS\\R                      4S jj5       r\" S\5      S\\R                      4S j5       r\" S\5      S\\R                      4S j5       r\" S\5      S\\R                      4S j5       rSrU =r$ )JukeboxModeli  r  c           	      4  > [         TU ]  U5        UR                  n[        U5      U l        U R                  U5        [        R                  " [        UR                  5       Vs/ s H  n[        UR                  U   U5      PM     sn5      U l        g s  snf r   )r   r   vqvae_configr  r  set_shared_paramsr   r   r>   rS  r7  prior_configspriors)r   rl   r  rm   r   s       r1   r   JukeboxModel.__init__  sz     **!,/
v&mmKPQWQaQaKbcKb%\&..u5u=Kbc
cs   %#Bc                     UR                    Hi  nUR                  Ul        UR                  Ul        UR                  Ul        UR                  Ul        UR
                  Ul        UR                  Ul        Mk     g)z
Initialises the parameters that are shared. This has to be done here because the list of `JukeboxPriorConfig`
is nest, and is thus unreachable in the `from_dict` function
N)r  r  r  r%  r&  r!  r  )r   model_configrl   s      r1   r  JukeboxModel.set_shared_params	  sh    
 #00F#/#=#=F !-!9!9F".";";F".";";F#/#=#=F +7+M+MF( 1r3   c                 :    U R                   R                  XX45      $ r   )r  ry  )r   ri   r  r  r  s        r1   ry  JukeboxModel.decode	  s    zz  IQQr3   c                 :    U R                   R                  XX45      $ r   )r  ru  )r   r  r  r  r  s        r1   ru  JukeboxModel.encode	  s    zz  9PPr3   c                 D   X#-   S-
  U-  n[        U[        R                  5      (       a  [        R                  " XSS9$ [        U[        5      (       a5  [	        [        U Vs/ s H  n[        R                  " XSSS9PM     sn6 5      $ Uc  S /U-  $ [        S5      es  snf )Nr   r   r   zUnknown input type)r  r   r  r  r=   rc   rW  )r   objr  
split_sizer  r   s         r1   split_batchJukeboxModel.split_batch	  s    *Q.:=c5<<((;;sA66T""sSstekk$BsSTUU[6H$$011	 Ts   "Bc           	          U R                   U   nX   n	UR                  n
U	R                  S   nXU-
  :  a
  X-   US'   SnOXS'   X-
  U-   nU R                  XX4X\U5      $ )Nr   r  r   )r  rN   r\   sample_single_window)r   ri   rj   rE   sampling_kwargsrm   tokens_to_samplemax_batch_sizerk   r  rN   nb_sampled_tokensrQ   s                r1   sample_partial_window"JukeboxModel.sample_partial_window$	  s     E"%,*003'777/@/SOO,E/4O,%-0@@E((vX]ftuur3   c                    U R                   U   nUS   R                  S   n	UR                  n
Xj-   nX   S S 2Xk24   nUR                  SS 5      nSU;   a  X-
  nUR                  S   nXR                  S   -
  n[        R                  SU SU SXm-    SU S3	5        US::  a  U$ UR                  XU5      nUR                  X&U R                  U5      nU R                  XU5      nU R                  UX5      nU R                  UX5      n/ n[        [        UUU5      S	S
9nU H  u  nnnSS/UR                  S   S:H     nUR                  SU SU SU SU R                  UR                  -   3SS9  UR                  " SUR                  S   UUUS.UD6nUR                  U5        M     [         R"                  " USS9nUS S 2U* S 24   n[         R"                  " X   U/SS9X'   U$ )Nr   r  r   r  z tokens for [r  z]. Conditioning on z tokensFr  r}  r~  z[prior level z] z
 Sampling z tokens out of Tr  )r  ri   ra  rv   r   r}  )r  r\   rN   getrQ  rR  rb  r`   rD   r  r   rc   r  rP  r   rM   r   r8   )r   ri   rj   rE   r  rm   rQ   r  rk   r  rN   ru   previous_sampled_tokensr  conditioning_tokens
new_tokensra  rv   r  music_tokens_conds_listmetadata_listrG   r	  r  music_tokens_conds_ir|   r  r{   r  music_tokens_news                                 r1   r  !JukeboxModel.sample_single_window5	  sg   E" O))!,	m"."5al"C'++OTBo-KM5;;A>"%B%B1%EE
mE7!E<Q;R S#$G-	

 ? #99,sS %%fT5F5FO ,,-DQ_`"&"2"23Ey"a((9M-/FV^cd@H<N0**>+?+?+Ba+GHD$$wbj H%%)<)<<=? % 
 || (..q1+#7#	
 "H MM(# AI 6q1 *!j[\/:#ii)<>N(OUVWr3   c	           
          X`R                   U   R                  :  aB  [        X`R                   U   R                  U5      n	U	 H  n
U R                  XX4XZU5      nM     U$ U R	                  XX4XVU5      nU$ r   )r  rN   rR   r  r  )r   ri   rj   rE   r  rm   rD   rO   r  r	  rQ   s              r1   sample_levelJukeboxModel.sample_levelp	  s     ;;u-333!,E0B0H0H*UH!#88 &5Q_  "   55fuTbL r3   r  c                    U R                   S   nUb  UnO;[        XR                  R                  -  5      UR                  -  UR                  -  nUc  [        [        U R                   5      5      nUU l        U GH7  nU[        U R                   5      S-
  :X  a  SOUUUS.nUU R                   U   R                  -  n[        U R                  R                  U   U R                   U   R                  -  5      nUU:w  a  UOUnU R                  UUU   UUUUUU5      nU(       d  M  U R                  R                  UU   R                  5        [        R                  " 5          [        U R                   5      U-
  S-
  nU R                  R!                  USUS-    UUU   R"                  S   S9nSSS5        SU 3n[$        R&                  R)                  U5      (       d  [$        R*                  " U5        [-        UUUWR/                  5       S9  U
(       d  GM  U R                   S   c  GM  U R                   S   R0                  S:  d  GM  [        R                  " 5          [3        XS   U R                   S   U R                  5      nSSS5        [        R4                  " S	W0U S
35        GM:     U$ ! , (       d  f       GN= f! , (       d  f       NC= f)aq  
Core sampling function used to generate music tokens. Iterates over the provided list of levels, while saving
the generated raw audio at each step.

Args:
    music_tokens (`list[torch.LongTensor]`):
        A sequence of music tokens of length `self.levels` which will be used as context to continue the
        sampling process. Should have `self.levels` tensors, each corresponding to the generation at a certain
        level.
    labels (`list[torch.LongTensor]`):
        List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
        lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
        which are used to condition the generation.
    sample_levels (`list[int]`):
        List of the desired levels at which the sampling will be done. A level is equivalent to the index of
        the prior in the list of priors
    metas (`list[Any]`, *optional*):
        Metadatas used to generate the `labels`
    chunk_size (`int`, *optional*, defaults to 32):
        Size of a chunk of audio, used to fill up the memory in chunks to prevent OOM errors. Bigger chunks
        means faster memory filling but more consumption.
    sampling_temperature (`float`, *optional*, defaults to 0.98):
        Temperature used to adjust the randomness of the sampling.
    lower_batch_size (`int`, *optional*, defaults to 16):
        Maximum batch size for the lower level priors
    max_batch_size (`int`, *optional*, defaults to 16):
        Maximum batch size for the top level priors
    sample_length_in_seconds (`int`, *optional*, defaults to 24):
        Desired length of the generation in seconds
    compute_alignments (`bool`, *optional*, defaults to `False`):
        Whether or not to compute the alignment between the lyrics and the audio using the top_prior
    sample_tokens (`int`, *optional*):
        Precise number of tokens that should be sampled at each level. This is mostly useful for running dummy
        experiments
    offset (`int`, *optional*, defaults to 0):
        Audio offset used as conditioning, corresponds to the starting sample in the music. If the offset is
        greater than 0, the lyrics will be shifted take that intoaccount
    save_results (`bool`, *optional*, defaults to `True`):
        Whether or not to save the intermediate results. If `True`, will generate a folder named with the start
        time.
    sample_length (`int`, *optional*):
        Desired length of the generation in samples.

Returns: torch.Tensor

Example:

```python
>>> from transformers import AutoTokenizer, JukeboxModel, set_seed
>>> import torch

>>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
>>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()

>>> labels = tokenizer(**metas)["input_ids"]
>>> set_seed(0)
>>> zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
>>> zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
>>> zs[0]
tensor([[1853, 1369, 1150, 1869, 1379, 1789,  519,  710, 1306, 1100, 1229,  519,
      353, 1306, 1379, 1053,  519,  653, 1631, 1467, 1229, 1229,   10, 1647,
     1254, 1229, 1306, 1528, 1789,  216, 1631, 1434,  653,  475, 1150, 1528,
     1804,  541, 1804, 1434]])
```
r   Nr   gGz?)r  r  r  )r  r  zjukebox/level_)r   r   r   z/lyric_alignments.pt)r  r?   rl   r  rP  r>   r7   rD   r]   rN   r  r  r;   r<   r   rD  ry  r\   osr   existsmakedirsr   re   r#  r   r   )r   ri   rj   sample_levelsr   r  sampling_temperaturelower_batch_sizer  r  compute_alignmentsr  rE   save_resultsra   	top_priorrD   rm   r  total_token_to_samplerO   r  r  logdirr   s                            r1   _sampleJukeboxModel._sample	  s   j KKN	$(L ,{{/H/HHIYMdMdd''(L  !#dkk"23M )"E %T[[)9A)= =CW(!.O %1DKK4F4T4T$T!T[[55e<t{{5?Q?W?WWXJ16-1G-^N,,u%	L |

l51889]]_"%dkk"2U":Q">K $

 1 1$[uqy1{VbchViVoVopqVr !2 !I %
 *%1ww~~f--KK'U	@QR%%$++a.*DUVIpIpstIt%2<DKKXYN\`\g\g%h
 )JJj9fXEY;Z[K #N  %_ )s   &AJ9&(K9
K	
K	a  
        Generates music tokens based on the provided `labels. Will start at the desired prior level and automatically
        upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
        the VQ-VAE decoder to convert the music tokens to raw audio.

        Args:
            labels (`list[torch.LongTensor]`) :
                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
                which are used to condition the generation.
            n_samples (`int`, *optional*, default to 1) :
                Number of samples to be generated in parallel.
        c           
      b   UR                  S[        [        [        U R                  5      5      5      5      n[        [        U R                  5      5       Vs/ s H5  n[
        R                  " US[
        R                  US   R                  S9PM7     nnU R                  " XaU40 UD6nU$ s  snf )a  
Example:

```python
>>> from transformers import AutoTokenizer, JukeboxModel, set_seed

>>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
>>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")

>>> lyrics = "Hey, are you awake? Can you talk to me?"
>>> artist = "Zac Brown Band"
>>> genre = "Country"
>>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
>>> set_seed(0)
>>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400)

>>> with torch.no_grad():
...     model.decode(music_tokens)[:, :10].squeeze(-1)
tensor([[-0.0219, -0.0679, -0.1050, -0.1203, -0.1271, -0.0936, -0.0396, -0.0405,
    -0.0818, -0.0697]])
```
r  r   rT   )
popr=   r>   r7   r  r   r9   r:   r<   r  )r   rj   r  r  r  rs  ri   s          r1   ancestral_sampleJukeboxModel.ancestral_sample
  s    N (++OT%DKKHXBY=Z[Z_`cdhdodo`pZq
ZqUVEKK	1EJJvay?O?OPZq 	 
 ||L-[?[	
s   <B,az  Generates a continuation of the previously generated tokens.

        Args:
            music_tokens (`list[torch.LongTensor]` of length `self.levels` ) :
                A sequence of music tokens which will be used as context to continue the sampling process. Should have
                `self.levels` tensors, each corresponding to the generation at a certain level.
        c           
          UR                  S[        [        [        U R                  5      5      5      5      nU R
                  " XU40 UD6nU$ )Nr  r  r=   r>   r7   r  r  r   ri   rj   r  r  s        r1   continue_sampleJukeboxModel.continue_sample9
  sD     (++OT%DKKHXBY=Z[||L-[?[r3   a  Upsamples a sequence of music tokens using the prior at level `level`.

        Args:
            music_tokens (`list[torch.LongTensor]` of length `self.levels` ) :
                A sequence of music tokens which will be used as context to continue the sampling process. Should have
                `self.levels` tensors, each corresponding to the generation at a certain level.
        c           
          UR                  S[        [        [        U R                  5      S-
  5      5      5      nU R
                  " XU40 UD6nU$ )Nr  r   r  r  s        r1   upsampleJukeboxModel.upsampleH
  sJ     (++OT%DKKHX[\H\B]=^_||L-[?[r3   a'  Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are
        used: as conditioning for each level, which means that no ancestral sampling is required.

        Args:
            raw_audio (`list[torch.Tensor]` of length `n_samples` ) :
                A list of raw audio that will be used as conditioning information for each samples that will be
                generated.
        c           
         UR                  S[        [        [        U R                  5      5      5      5      nU R
                  R                  UR                  5      R                  5         [        R                  " 5          U R
                  R                  US[        U R                  5      UR                  S   S9nS S S 5        U R                  " WX$40 UD6nU$ ! , (       d  f       N$= f)Nr  r   rq  )r  r=   r>   r7   r  r  r;   r<   re   r   rD  ru  r\   r  )r   r  rj   r  r  ri   s         r1   r  JukeboxModel.primed_sampleW
  s     (++OT%DKKHXBY=Z[

i&&'--/]]_::,,qC4DPYP_P_`aPb - L  ||L&[?[ _s    =C
C))r  rD   r  r  )N    g\(\?   r     FNr   TN)r   )r   r   r   r   _no_split_modulesr   r  ry  ru  r  r  r  r  r   rD  r=   r  r  r	   r   JUKEBOX_SAMPLING_INPUT_DOCSTRINGr  r  r  r   r   r   s   @r1   r  r    s`    ((
NRQ	2v"8v  ]]_ !!# H  
e	!H HT 	$uO_O_J` > 	 	)	$uO_O_J` 	
 	 	)	4HXHXCY 	
 	 	)T%JZJZE[ r3   r  )r  r  r  r7  )Br  r   r  typingr   rf   rg   r   torch.nn.functionalr   r   r#   torch.nnr   FusedLayerNormr  r   modeling_utilsr   utilsr	   r
   utils.loggingr   configuration_jukeboxr   r   r   r   
get_loggerr   rQ  re   r2   rJ   rR   r   r   r   Moduler   r   r   r   r  r  r  r$  r  JUKEBOX_START_DOCSTRINGr  r  r  r	  r  r  r  r  r  r  r  r7  r  r  r  __all__r}  r3   r1   <module>r     sG     	       0 " . 3 " l l 
		H	% !"E%L= #L,D6r=;6BII *:BII :*bii *bii (!RYY !4RYY 4RYY 6[jRYY [j|'F		 'FT "  v(? v(v(r (
9~ 
9sryy sl299 49(		 9(x G ryy G T
*299 *Z-BII -`5"bii 5"pk1? k1\,_ ,$$   
 r) rrj Ur3   