
    cCi^                        S r SSKrSSKJr  SSKJrJr  SSKrSSKJ	r	  SSK
J	s  Jr  SSKJr  SSKJrJr  SS	KJr  S
SKJr  \ " S S\5      5       r\ " S S\5      5       r\ " S S\5      5       r " S S\	R2                  5      r " S S\	R2                  5      r " S S\	R2                  5      r " S S\	R2                  5      r " S S\	R2                  5      r " S S\	R2                  5      r " S S\	R2                  5      r  " S  S!\	R2                  5      r!\ " S" S#\5      5       r"\" S$S%9 " S& S'\"5      5       r#S'S#/r$g)(zTransformers Xcodec model.    N)	dataclass)OptionalUnion   )PreTrainedAudioTokenizerBase)ModelOutputauto_docstring   )	AutoModel   )XcodecConfigc                   j    \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Srg)XcodecOutput   aW  
Args:
    audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
        Discrete code indices computed using `model.encode`.
    audio_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`, *optional*)
        Decoded audio values obtained using the decoder part of Xcodec.
Naudio_codesaudio_values )__name__
__module____qualname____firstlineno____doc__r   r   torch
LongTensor__annotations__r   FloatTensor__static_attributes__r       d/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/xcodec/modeling_xcodec.pyr   r      s3     /3K%**+204L(5,,-4r   r   c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)XcodecEncoderOutput-   z
Args:
    audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
        Discrete code indices computed using `model.encode`.
Nr   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r!   r!   -   s     /3K%**+2r   r!   c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)XcodecDecoderOutput8   z
Args:
    audio_values (`torch.FloatTensor`  of shape `(batch_size, channels, num_samples)`, *optional*):
        Decoded audio values obtained using the decoder part of Xcodec.
Nr   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r$   r$   8   s     15L(5,,-4r   r$   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jr
SrU =r$ )ResidualUnitC   zFResidual block for SemanticEncoder and SemanticDecoder used in Xcodec.configin_channelsout_channelsdilationc                 
  > [         TU ]  5         [        R                  " 5       U l        UR
                  S-
  S-  U-  n[        R                  " UUUR
                  SUUSSS9U l        [        R                  " X3SSS9U l        g )Nr   r
   F)stridepaddingr,   groupsbias)r*   r+   kernel_sizer1   )	super__init__nnELU
activationunit_kernel_sizeConv1dconv1conv2)selfr)   r*   r+   r,   r/   	__class__s         r   r4   ResidualUnit.__init__F   s{    &&(++a/A5AYY##	

 YY<`ahmn
r   hidden_statereturnc                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nX-   $ Nr7   r:   r;   )r<   r?   output_tensors      r   forwardResidualUnit.forwardV   sC    5

=16

=1++r   rC   )r   r   r   r   r   r   intr4   r   TensorrE   r   __classcell__r=   s   @r   r'   r'   C   sM    Po| o# oS o\_ o ,ELL ,U\\ , ,r   r'   c                   v   ^  \ rS rSrS\S\S\S\4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )SemanticEncoderBlock^   r)   r*   r+   r.   c                   > [         TU ]  5         [        R                  " UR                   Vs/ s H  n[        XX%5      PM     sn5      U l        US:X  a  SOSU-  nUS-
  S-  n[        R                  " X#XdUSS9U l        g s  snf )Nr   r   r
   Tr2   r.   r/   r1   )	r3   r4   r5   
ModuleListblock_dilationsr'   	res_unitsr9   conv)	r<   r)   r*   r+   r.   r,   kernelr/   r=   s	           r   r4   SemanticEncoderBlock.__init___   s~    V\VlVlmVl(\&{EVlm

 kF
A:!#IIkVdkrvw	 ns   Br?   r@   c                 ^    U R                    H  nU" U5      nM     U R                  U5      nU$ rB   )rR   rS   r<   r?   units      r   rE   SemanticEncoderBlock.forwardj   s.    NND-L #yy.r   rS   rR   r   r   r   r   r   rG   r4   r   rH   rE   r   rI   rJ   s   @r   rL   rL   ^   sJ    	x| 	x# 	xS 	xZ] 	xELL U\\  r   rL   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )SemanticEncoderq   c           	        > [         TU ]  5         [        UR                  5      [        UR                  5      :w  a  [        S5      e[        R                  " UR                  UR                  UR                  SUR                  S-  SS9U l
        UR                  n/ n[        UR                  5       H<  u  pE[        UR                  UR                  U   -  5      nU[        XXe5      /-  nUnM>     [        R                  " U5      U l        g )Nz:Number of strides must match the number of channel_ratios.r   r
   Fr1   )r3   r4   lenstrideschannel_ratios
ValueErrorr5   r9   semantic_hidden_sizer2   rS   	enumeraterG   rL   rP   conv_blocks)r<   r)   r*   rg   ir.   r+   r=   s          r   r4   SemanticEncoder.__init__r   s    v~~#f&;&;"<<YZZII''''!#
	 11"6>>2IAv::V=R=RST=UUVL0l[\\K&K 3
 ==5r   r?   r@   c                 ^    U R                  U5      nU R                   H  nU" U5      nM     U$ rB   rS   rg   r<   r?   blocks      r   rE   SemanticEncoder.forward   s0    yy.%%E .L &r   rk   
r   r   r   r   r4   r   rH   rE   r   rI   rJ   s   @r   r]   r]   q   s(    6,ELL U\\  r   r]   c                   v   ^  \ rS rSrS\S\S\S\4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )SemanticDecoderBlock   r)   r*   r+   r.   c                 b  > [         T	U ]  5         US:X  a  [        R                  " UUSSSSS9U l        O6SU-  nUS-   S-  nUS-  S:X  a  SOSn[        R
                  " X#XTXgSS9U l        [        R                  " UR                   Vs/ s H  n[        XX85      PM     sn5      U l	        g s  snf )	Nr   r   TrO   r
   r   Fr`   )
r3   r4   r5   r9   rS   ConvTranspose1drP   rQ   r'   rR   )
r<   r)   r*   r+   r.   r2   r/   output_paddingr,   r=   s
            r   r4   SemanticDecoderBlock.__init__   s    Q;		DI f*Kza'G"(1*/QqN**;^cDI X^XnXnoXnH\&GXno
os   	B,r?   r@   c                 ^    U R                  U5      nU R                   H  nU" U5      nM     U$ rB   rZ   rW   s      r   rE   SemanticDecoderBlock.forward   s.    yy.NND-L #r   rZ   r[   rJ   s   @r   rq   rq      sE    
| 
# 
S 
Z] 
.ELL U\\  r   rq   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )SemanticDecoder   c           	        > [         TU ]  5         [        R                  " UR                  [        UR                  UR                  S   -  5      UR                  SUR                  S-  SS9U l        / n[        UR                  5       H  u  p4[        UR                  UR                  U   -  5      nU[        UR                  5      S-
  :  a)  [        UR                  UR                  US-      -  5      nOUR                  nU[        XXd5      /-  nM     [        R                  " U5      U l        [        R                  " UR                  UR                  UR                  SUR                  S-  SS9U l        g )Nr   r   r
   F)r*   r+   r2   r.   r/   r1   )r.   r/   r1   )r3   r4   r5   r9   re   rG   rc   r2   r:   rf   rb   ra   rq   rP   rg   r;   )r<   r)   rg   rh   r.   r*   r+   r=   s          r   r4   SemanticDecoder.__init__   sQ   YY33V886;P;PQR;SST**&&!+

 "6>>2IAf99F<Q<QRS<TTUKC--.23"6#>#>AVAVWX[\W\A]#]^%::0l[\\K 3 ==5YY''''&&!+

r   r?   r@   c                     U R                  U5      nU R                   H  nU" U5      nM     U R                  U5      nU$ rB   )r:   rg   r;   rl   s      r   rE   SemanticDecoder.forward   s>    zz,/%%E .L &zz,/r   )r:   r;   rg   ro   rJ   s   @r   rz   rz      s(    
>ELL U\\  r   rz   c                   >   ^  \ rS rSrSrU 4S jrS rS rS rSr	U =r
$ )XcodecEuclideanCodebook   z!Codebook with Euclidean distance.c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      nUR                  U l        U R                  S[        R                  " S/5      5        U R                  S[        R                  " UR                  5      5        U R                  SU5        U R                  SUR                  5       5        g )NinitedTcluster_sizeembed	embed_avg)	r3   r4   r   zeroscodebook_sizecodebook_dimregister_bufferrH   clone)r<   r)   r   r=   s      r   r4    XcodecEuclideanCodebook.__init__   s    F00&2E2EF#11Xu||TF';<^U[[9M9M-NOWe,[%++-8r   c                    U R                   R                  5       nUR                  S5      R                  SSS9nUSU-  U-  -
  UR                  S5      R                  SSS9-   * nUR	                  SS9R
                  nU$ )Nr
   r   T)keepdimr   dim)r   tpowsummaxindices)r<   hidden_statesr   scaled_statesdist	embed_inds         r   quantize XcodecEuclideanCodebook.quantize   s    

%))!,00D0A]!2U!::UYYq\=M=MaY]=M=^^_HHH$,,	r   c                     UR                   nUR                  SUS   45      nU R                  U5      nUR                  " US S 6 nU$ )Nr   )shapereshaper   view)r<   r   r   r   s       r   encodeXcodecEuclideanCodebook.encode   sM    ##%--r59o>MM-0	NNE#2J/	r   c                 F    [         R                  " XR                  5      nU$ rB   )F	embeddingr   )r<   r   	quantizeds      r   decodeXcodecEuclideanCodebook.decode   s    KK	::6	r   )r   )r   r   r   r   r   r4   r   r   r   r   rI   rJ   s   @r   r   r      s    +9 r   r   c                   @   ^  \ rS rSrSrS\4U 4S jjrS rS rSr	U =r
$ )XcodecVectorQuantization   zQ
Vector quantization implementation. Currently supports only euclidean distance.
r)   c                 B   > [         TU ]  5         [        U5      U l        g rB   )r3   r4   r   codebook)r<   r)   r=   s     r   r4   !XcodecVectorQuantization.__init__   s    /7r   c                 b    UR                  SSS5      nU R                  R                  U5      nU$ Nr   r
   r   )permuter   r   )r<   r   embed_ins      r   r   XcodecVectorQuantization.encode   s/    %--aA6==''6r   c                 b    U R                   R                  U5      nUR                  SSS5      nU$ r   )r   r   r   )r<   r   r   s      r   r   XcodecVectorQuantization.decode  s/    ==''	2##Aq!,r   )r   )r   r   r   r   r   r   r4   r   r   r   rI   rJ   s   @r   r   r      s#    8| 8
 r   r   c                      ^  \ rS rSrSrS\4U 4S jjrS rSS\4S jjr	SS\
R                  S\
R                  4S	 jjrS
\
R                  S\
R                  4S jrSrU =r$ ) XcodecResidualVectorQuantizationi  zn
Residual vector quantization implementation. Follows Algorithm 1 in https://huggingface.co/papers/2107.03312
r)   c                 "  > [         TU ]  5         [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        UR                  U l        UR                  U l	        UR
                  U l        g s  snf rB   )
r3   r4   r5   rP   rangenum_quantizersr   
quantizers
frame_rater   )r<   r)   _r=   s      r   r4   )XcodecResidualVectorQuantization.__init__  sq    --SXY_YnYnSo(pSoa)A&)ISo(pq ++#11$33 )qs   Bc                 b    [         R                  " U R                  5      U R                  -  S-  $ )zReturn bandwidth per quantizer.i  )mathlog2r   r   )r<   s    r   get_bandwidth_per_quantizer<XcodecResidualVectorQuantization.get_bandwidth_per_quantizer  s%    yy++,t>EEr   r@   c           	          U R                  5       nU R                  nUb1  US:  a+  [        [        S[        R
                  " X-  5      5      5      nU$ )z:Return num_quantizers based on specified target bandwidth.        r   )r   r   rG   r   r   floor)r<   	bandwidthbw_per_qr   s       r    get_num_quantizers_for_bandwidthAXcodecResidualVectorQuantization.get_num_quantizers_for_bandwidth  sJ    335,, Y_ Q

93G(H!IJNr   
embeddingsc                     U R                  U5      nUn/ nU R                  SU  H:  nUR                  U5      nUR                  U5      nXH-
  nUR	                  U5        M<     [
        R                  " U5      n	U	$ )z
Encode the input tensor into discrete indices using RVQ, with the number of quantizers selected based on the given bandwidth.
Each quantizer /codebook residually quantizes the input and returns the nearest indices in terms of Euclidian distance.
N)r   r   r   r   appendr   stack)
r<   r   r   r   residualall_indices	quantizerr   r   out_indicess
             r   r   'XcodecResidualVectorQuantization.encode$  s    
 >>yI.9I&&x0G!((1I+Hw'	 :
 kk+.r   codesc                     [         R                  " SUR                  S9n[        U5       H)  u  p4U R                  U   nUR                  U5      nX&-   nM+     U$ )z9Decode the given codes to their quantized representation.r   )device)r   tensorr   rf   r   r   )r<   r   quantized_outrh   r   r   r   s          r   r   'XcodecResidualVectorQuantization.decode4  sU    S>#E*JA*I!((1I)5M + r   )r   r   r   r   rB   )r   r   r   r   r   r   r4   r   rG   r   r   rH   r   r   r   rI   rJ   s   @r   r   r     sa    4| 4F#  %,,  ELL U\\  r   r   c                   6    \ rS rSrSr\rSrSrS r	S r
S rSrg	)
XcodecPreTrainedModeli>  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
xcodecinput_valuesc                 l   [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  [        R                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        R                  5      (       a  [        R                  R                  UR                  5        UR                  bh  [         R"                  " UR$                  UR&                  UR(                  S   -  -  5      n[        R                  R+                  UR                  U* US9  ggUR,                  R.                  S:X  a&  UR0                  R                  R                  S5        g[        U[        R2                  5      (       a  UR5                  5         g[        U[        R6                  5      (       a%  UR                  R                  R                  SSS9  g[        U[8        5      (       Ga)  UR:                  R=                  5        Hv  n[        U[        R                  5      (       d  M$  [        R                  R?                  UR                  SS	9  [        R                  RA                  UR                  S5        Mx     URB                  R=                  5        Hv  n[        U[        R                  5      (       d  M$  [        R                  R?                  UR                  SS	9  [        R                  RA                  UR                  S5        Mx     gg)
zInitialize the weightsr   )meanstdNg      ?r   )abSnake1dg{Gz?)r   )"
isinstancer5   Linearweightdatanormal_r)   initializer_ranger1   zero_	LayerNorm	GroupNormfill_r9   initkaiming_normal_r   sqrtr0   r*   r2   uniform_r=   r   alphart   reset_parameters	EmbeddingXcodecModelacoustic_encodermodulestrunc_normal_	constant_acoustic_decoder)r<   modulek	submodules       r   _init_weights#XcodecPreTrainedModel._init_weightsI  sy   fbii((MM&&CT[[5R5R&S{{&  &&( 'r|| <==KK""$MM$$S)		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' &&)3LL##C( 2 233##%--MM&&CT&:,, $44<<>	i33GG)))*:*:)EGG%%inna8 ? $44<<>	i33GG)))*:*:)EGG%%inna8 ? -r   c                    [         R                  R                  R                  n[	        [         R                  R                  R
                  S5      (       a.  [         R                  R                  R
                  R                  nU" U R                  R                  5        U" U R                  R                  5        U R                  R                   Hc  nU" UR                  5        UR                  UR                  UR                  4 H'  nU" UR                  5        U" UR                  5        M)     Me     U" U R                  R                  SS9  U" U R                  R                  SS9  U R                  R                   H`  nU" UR                  SS9  UR                  UR                  UR                  4 H%  nU" UR                  SS9  U" UR                  SS9  M'     Mb     g)znApply weight norm in the acoustic encoder and decoder because the original checkpoint has weight norm applied.weight_normr   nameN)r   r5   utilsr  hasattrparametrizationsr   r:   r;   rm   	res_unit1	res_unit2	res_unit3r   conv_t1)r<   r  rm   res_units       r   apply_weight_norm'XcodecPreTrainedModel.apply_weight_normi  sU   hhnn00588>>22MBB((..99EEKD))//0D))//0**00E$"__eoouOHNN+HNN+ P 1 	D))//h?D))//h?**00EH5"__eoouOHNN:HNN: P 1r   c                    U R                   U R                  4 H  nUR                  5        H  n [        R                  R
                  R                  USS9  [        US5      (       d  M?  SUR                  ;   d  MQ  [        R                  R
                  R                  R                  USSS9  M     M     g! [        [        4 a     Nsf = f)z=Remove the weight norm from the acoustic encoder and decoder.r   r  r  T)leave_parametrizedN)r   r   r   r   r5   r  remove_weight_normrd   AttributeErrorr  r  parametrizeremove_parametrizations)r<   r   ms      r   r  (XcodecPreTrainedModel.remove_weight_norm  s    ,,d.C.CDF^^%HHNN55ah5G 1011h!BTBT6THHNN..FFq(gkFl & E #N3 s   (B99CCr   N)r   r   r   r   r   r   config_classbase_model_prefixmain_input_namer   r  r  r   r   r   r   r   r   >  s*    
  L $O9@;0	mr   r   z$The Xcodec neural audio codec model.)custom_introc                     ^  \ rS rSrU 4S jr\S\R                  4S j5       rS\	R                  S\	R                  4S jr\  SS\	R                  S\\   S	\\   S\\	R                  \4   4S
 jj5       r\ SS\	R                  S	\\   S\\	R                  \4   4S jj5       r\   SS\	R                  S\\	R                     S\\   S	\\   S\\\	R                  \	R                  4   \4   4
S jj5       rSrU =r$ )r   i  c                 @  > [         TU ]  U5        Xl        UR                  S-  U l        [
        R                  " UR                  5      nUR                  U l	        UR                  U l        U R                  U R                  5        [        U5      U l        [        U5      U l        [
        R                  " UR"                  5      R%                  5       U l        [(        R*                  " UR,                  UR,                  5      U l        [(        R*                  " UR,                  UR"                  R,                  5      U l        [(        R*                  " UR,                  UR                  R,                  5      U l        [5        U5      U l        U R9                  5         g )Nr
   )r3   r4   r)   
hop_lengthpadr   from_configacoustic_model_configencoderr   decoderr   _adjust_dac_decoderr]   encoder_semanticrz   decoder_semanticsemantic_model_configevalsemantic_modelr5   r   hidden_sizefcfc1fc2r   r   	post_init)r<   r)   acoustic_modelr=   s      r   r4   XcodecModel.__init__  s'    $$)"..v/K/KL . 6 6 . 6 6  !6!67 / 7 / 7'33F4P4PQVVX))F..0B0BC99V//1M1M1Y1YZ99V//1M1M1Y1YZ9&A 	r   r"  c                    U R                  5        Hi  n[        U[        R                  5      (       d  M$  [        UR                  [
        5      (       a  UR                  S   OUR                  nUS-  4Ul        Mk     [        U S5      (       aE  [        U R                  [        R                  5      (       a  [        R                  " 5       U l        ggg)z
DAC implemented in Xcodec is slightly different from the HF version.
DAC in Xcodec adjusts the output padding in every ConvTranspose1d in the decoder and removes
the final `nn.Tanh` activation function.
r   r
   tanhN)r   r   r5   rt   r.   tupleru   r  r1  TanhIdentity)r"  r   r.   s      r   r#  XcodecModel._adjust_dac_decoder  s     oo'F&""4"455-7u-M-Mq)SYS`S`)/!% ( 7F##
7<<(I(I;;=GL )J#r   r   r@   c                 P   US S 2SS S 24   n[         R                  " XR                  U R                  45      n[        R                  " 5          U R	                  USS9nUR
                  nS S S 5        [        R                  " WSS9nUR                  SS9$ ! , (       d  f       N2= f)Nr   T)output_hidden_statesr   r   )r   r  r   no_gradr(  r   r   r   )r<   r   outputsr   stackeds        r   _extract_semantic_features&XcodecModel._extract_semantic_features  s    #Aq!G,uu\HHdhh+?@]]_)),T)RG#11M  ++m3|||"" _s   B
B%r   return_dictc           	         Ub  UOU R                   R                  nUR                  S   nUS:w  a  [        SU 35      eUc  U R                   R                  S   nO?X R                   R                  ;  a&  [        SU SU R                   R                   S35      eU R                  U5      R                  5       nU R                  UR                  SS5      5      nU R                  U5      nUR                  S   UR                  S   :w  aU  U R                  [        R                  " USS2S	SS24   U R                  U R                  45      R                  S5      5      n[        R                  " Xv/SS
9nU R                  UR                  SS5      5      R                  SS5      nU R                   R#                  X5      n	U	R                  S	S5      n	U(       d  U	$ [%        U	5      $ )a  
input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
    Float values of the input audio waveform.
bandwidth (`float`, *optional*):
    The target bandwidth in (kbps) supports only values in `config.target_bandwidths`.
    Defaults to the highest available bandwidth `4.0` kbps.
return_dict (`bool`, *optional*):
    Whether or not to return a [`~utils.ModelOutput`].

Returns:
    `torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)` containing the discrete encoded audio codes.
Nr   zAudio must be mono, but got r   z)This model doesn't support the bandwidth z. Select one of .r
   r   r   )r)   r=  r   rd   target_bandwidthsr;  detachr$  	transposer   r   r  	unsqueezer   catr*  r   r   r!   )
r<   r   r   r=  channelse_semantic_input
e_semantic
e_acousticr   r   s
             r   r   XcodecModel.encode  s   & &1%<k$++BYBY%%a(q=;H:FGG55b9Ikk;;;;I;FVW[WbWbWtWtVuuvw   ::<HOOQ**+;+E+Ea+KL
**<8
A*"2"21"55..quu\!Q'5JTXXW[W_W_L`/a/k/klm/noJYY
7Q?
WWZ11!Q78BB1aH
nn++JB!++Aq1";//r   r   c                 >   Ub  UOU R                   R                  nUR                  SS5      nU R                  R	                  U5      nU R                  UR                  SS5      5      R                  SS5      nU R                  U5      nU(       d  U$ [        U5      $ )al  
audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`):
    Discrete code indices computed using `model.encode`.
return_dict (`bool`, *optional*):
    Whether or not to return a [`~utils.ModelOutput`]

Returns:
    Decoded audio values of shape `(batch_size, channels, num_samples)` obtained using the decoder part of
    Xcodec.
r   r   r
   )r)   r=  rB  r   r   r,  r   r$   )r<   r   r=  r   quantized_acousticr   s         r   r   XcodecModel.decode  s      &1%<k$++BYBY!++Aq1NN))+6	!XXi&9&9!Q&?@JJ1aP,,-?@"<00r   c                     Ub  UOU R                   R                  nUR                  S   nUc  U R                  XSS9nU R	                  X$S9S   SSU24   nU(       d  X&4$ [        X&S9$ )a  
input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
    The raw float values of the input audio waveform.
audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`:
    Discrete code indices computed using `model.encode`.
bandwidth (`float`, *optional*):
    Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
bandwidth (`float`, *optional*):
    Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
return_dict (`bool`, *optional*):
    Whether to return a [`XcodecOutput`] instead of a plain tuple.

Returns:
    `XcodecOutput` or tuple `(audio_codes, audio_values)`:
    - `audio_codes` of shape `(batch_size, num_quantizers, codes_length)`: the quantized discrete codes.
    - `audio_values` of shape `(batch_size, channels, num_samples)`: the reconstructed audio waveform given the codes.

Example:

```python
>>> from datasets import load_dataset
>>> from transformers import AutoFeatureExtractor, XcodecModel

>>> model_id = "hf-audio/xcodec-hubert-librispeech"
>>> model = XcodecModel.from_pretrained(model_id)
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
>>> audio_sample = dataset[0]['audio']['array']

>>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")

>>> outputs = model(**inputs)
>>> audio_codes = outputs.audio_codes
>>> audio_values = outputs.audio_values
```
Nr   F)r=  r   .)r   r   )r)   r=  r   r   r   r   )r<   r   r   r   r=  lengthr   s          r   rE   XcodecModel.forward  s    \ &1%<k$++BYBY##B'++l5+QK{{;{HKCQXRXQXLY..OOr   )r   r   r)   r%  r$  r*  r+  r,  r  r   r(  )NNrB   )NNN)r   r   r   r   r4   staticmethodr5   Moduler#  r   r   r;  r	   rH   r   floatboolr   r!   r   r$   r   r2  r   rE   r   rI   rJ   s   @r   r   r     sw   & )RYY ) )#u7H7H #UM^M^ #  &*&*	/0ll/0 E?/0 d^	/0
 
u||00	1/0 /0b  '+1\\1 d^1 
u||00	1	1 16  /3%)&*8Pll8P ell+8P E?	8P
 d^8P 
uU\\5<</0,>	?8P 8Pr   r   )%r   r   dataclassesr   typingr   r   r   torch.nnr5   torch.nn.functional
functionalr   modeling_utilsr   r  r   r	   autor   configuration_xcodecr   r   r!   r$   rQ  r'   rL   r]   rq   rz   r   r   r   r   r   __all__r   r   r   <module>r]     sY   !  ! "     : 0  . 
5; 
5 
5 3+ 3 3 5+ 5 5,299 ,6299 &bii <299 >%bii %Pbii @ryy ,/ryy /d Km8 Km Km\ GHsP' sP IsPl 1
2r   