
    cCi                        S SK JrJr  S SKJr  S SKrS SKJr	  S SKr
S SKJrJrJr  S SKJrJr  S SKJr  S SKJr  S SKJrJr  S SKJr  S	S
KJrJrJrJrJrJrJ r J!r!J"r"  S	SK#J$r$J%r%J&r&J'r'  S	SK(J)r)J*r*J+r+  SSK,J-r-  \+R\                  " \/5      r0Sr1Sr2\Rf                  r3S r4Sr5Sr6 " S S\Rn                  5      r8 " S S\Rn                  5      r9 " S S\Rn                  5      r: " S S\Rn                  5      r; " S S\Rn                  5      r< " S S\Rn                  5      r= " S  S!\Rn                  5      r> " S" S#\Rn                  5      r? " S$ S%\Rn                  5      r@ " S& S'\Rn                  5      rA " S( S)\Rn                  5      rB " S* S+\Rn                  5      rC " S, S-\%5      rD " S. S/\Rn                  5      rE\)" S0\55       " S1 S2\D5      5       rF\&" \F\1\\25         " S3 S4\Rn                  5      rG\)" S5\55       " S6 S7\D5      5       rH\&" \H\1\\2S8S99   " S: S;\Rn                  5      rI\)" S<\55       " S= S>\D5      5       rJ\&" \J\1\!\25         " S? S@\Rn                  5      rK\)" SA\55       " SB SC\D5      5       rL\'" \L\6R                  SD5      5        \&" \L\1\\25         " SE SF\Rn                  5      rN\)" SG\55       " SH SI\D5      5       rO\&" \O\1\"\25         " SJ SK\Rn                  5      rP\)" SL\55       " SM SN\D5      5       rQ\&" \Q\1\ \25         " SO SP\Rn                  5      rR\)" SQ\55       " SR SS\D5      5       rS\&" \S\1\\25        / STQrTg)U    )CallableOptionalN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )	-FlaxBaseModelOutputWithPastAndCrossAttentionsFlaxBaseModelOutputWithPooling0FlaxBaseModelOutputWithPoolingAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )RobertaConfigzFacebookAI/roberta-baser!   c                    X:g  R                  S5      nUR                  S:  ac  UR                  SUR                  S   45      n[        R
                  " USS9R                  S5      U-  nUR                  U R                  5      nO'[        R
                  " USS9R                  S5      U-  nUR                  S5      U-   $ )a  
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    input_ids: jnp.ndarray
    padding_idx: int

Returns: jnp.ndarray
i4   r    axis)astypendimreshapeshapejnpcumsum)	input_idspadding_idxmaskincremental_indicess       k/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/roberta/modeling_flax_roberta.py"create_position_ids_from_input_idsr3   4   s     $,,T2Dyy1}||RB01!jjA6==dCdJ199)//J!jjA6==dCdJ%%d+k99    a   

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`RobertaConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                   r    \ rS rSr% Sr\\S'   \R                  r	\R                  \S'   S r
S
S\4S jjrSrg	)FlaxRobertaEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9U R                  S9U l
        [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9U R                  S9U l        [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9U R                  S9U l        [         R                  " U R                  R                   U R                  S9U l        [         R"                  " U R                  R$                  S9U l        g )N)stddev)embedding_initr9   epsilonr9   rate)nnEmbedr8   
vocab_sizehidden_sizejaxinitializersnormalinitializer_ranger9   word_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutselfs    r2   setupFlaxRobertaEmbeddings.setup   sJ   !xxKK""KK##66..55T[[=Z=Z5[**	 
 $&88KK//KK##66..55T[[=Z=Z5[**	$
  &(XXKK''KK##66..55T[[=Z=Z5[**	&
" dkk.H.HPTPZPZ[zzt{{'F'FGr4   deterministicc                    U R                  UR                  S5      5      nU R                  UR                  S5      5      nU R                  UR                  S5      5      nXh-   U-   n	U R	                  U	5      n	U R                  XS9n	U	$ )Nr#   rW   )rI   r(   rK   rM   rN   rR   )
rT   r.   token_type_idsposition_idsattention_maskrW   inputs_embedsposition_embedsrM   hidden_statess
             r2   __call__FlaxRobertaEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &=O }5]Pr4   )rN   rR   rK   rM   rI   NT)__name__
__module____qualname____firstlineno____doc__r!   __annotations__r,   float32r9   rU   boolr`   __static_attributes__ r4   r2   r6   r6      s5    Q{{E399"H,_c  r4   r6   c                       \ rS rSr% \\S'   Sr\\S'   \R                  r
\R                  \S'   S rS rS r\R                  S	 5       r    SS\\R$                     S\S\4S jjrSrg
)FlaxRobertaSelfAttention   r8   Fcausalr9   c                 ,   U R                   R                  U R                   R                  -  U l        U R                   R                  U R                   R                  -  S:w  a  [	        S5      e[
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        [
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        [
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        U R                  (       a9  [!        ["        R$                  " SU R                   R&                  4SS9SS9U l        g g )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads}r9   kernel_initr    rj   r9   )r8   rD   num_attention_headshead_dim
ValueErrorrA   Denser9   rE   rF   rG   rH   querykeyvaluerp   r	   r,   onesrJ   causal_maskrS   s    r2   rU   FlaxRobertaSelfAttention.setup   si   //4;;3R3RR;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ

 ;;/!T[[@@APX^ D r4   c                     UR                  UR                  S S U R                  R                  U R                  4-   5      $ Nr$   )r*   r+   r8   ru   rv   rT   r_   s     r2   _split_heads%FlaxRobertaSelfAttention._split_heads   s;    $$]%8%8!%<@_@_aeanan?o%oppr4   c                 n    UR                  UR                  S S U R                  R                  4-   5      $ r   )r*   r+   r8   rD   r   s     r2   _merge_heads%FlaxRobertaSelfAttention._merge_heads   s2    $$]%8%8!%<@W@W?Y%YZZr4   c                 (   U R                  SS5      nU R                  SS[        R                  UR                  UR
                  5      nU R                  SS[        R                  UR                  UR
                  5      nU R                  SSS 5      nU(       a  UR                  R                  Gt ppUR                  nS[        U	5      -  USS4-   n[        R                  " UR                  X5      n[        R                  " UR                  X.5      nXl        X'l        UR                  S   nUR                  U-   Ul        [        R                  " [        R                  " U
5      X-   :  [        U	5      SX4-   5      n[        UU5      nXU4$ )	a<  
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
cache
cached_keycached_valuecache_indexc                  H    [         R                  " S[         R                  S9$ )Nr   rt   )r,   arrayint32rl   r4   r2   <lambda>@FlaxRobertaSelfAttention._concatenate_to_cache.<locals>.<lambda>   s    CIIaWZW`W`Dar4   )r   r   r    )has_variablevariabler,   zerosr+   r9   r{   lenr   dynamic_update_slicebroadcast_toarangetupler   )rT   rz   r{   ry   r\   is_initializedr   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r2   _concatenate_to_cache.FlaxRobertaSelfAttention._concatenate_to_cache   s_    **7LA]]7L#))SYYPSPYPYZ
}}WnciiV[VaVabmmG]<abAKAQAQAWAW>ZY#))IS_,	1a/@@G**:+;+;SJC,,\-?-?PE"!&(-A% + 1 14M MK''

:&)NNj!Q(A$NNH +8^DN>))r4   Nkey_value_states
init_cacheoutput_attentionsc                    US LnUR                   S   n	U R                  U5      n
U(       a#  U R                  U5      nU R                  U5      nO"U R                  U5      nU R                  U5      nU R	                  U
5      n
U R	                  U5      nU R	                  U5      nU R
                  (       a  U
R                   S   UR                   S   pU R                  SS5      (       a\  U R                  S   S   nU R                  S   S   R                   S   n[        R                  " U R                  SSUS4SSUU45      nOU R                  S S 2S S 2S U2S U24   n[        R                  " UU	4UR                   SS  -   5      nUbR  U R
                  (       aA  [        R                  " [        R                  " USS9WR                   5      n[        UU5      nO,U R
                  (       a  WnOUb  [        R                  " USS9nU R
                  (       a3  U R                  SS5      (       d  U(       a  U R                  XX5      u  pnUb  [        R                   " US:  [        R"                  " UR                   S5      R%                  U R&                  5      [        R"                  " UR                   [        R(                  " U R&                  5      R*                  5      R%                  U R&                  5      5      nOS nS nU(       d+  U R,                  R.                  S:  a  U R1                  S	5      n[3        U
UUUU R,                  R.                  S
UU R&                  S S9	nUb  [        R4                  " SUU5      n[        R4                  " SUU5      nUR7                  UR                   S S S-   5      nU(       a  UU4nU$ U4nU$ )Nr   r    r   r   r   )r&   g        rR   T)biasdropout_rngdropout_ratebroadcast_dropoutrW   r9   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdr$   )r%   )r+   ry   rz   r{   r   rp   r   	variablesr   dynamic_slicer}   r,   r   expand_dimsr   r   selectfullr(   r9   finfominr8   attention_probs_dropout_probmake_rngr   einsumr*   )rT   r_   r\   layer_head_maskr   r   rW   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthr}   attention_biasr   attn_weightsattn_outputoutputss                          r2   r`   !FlaxRobertaSelfAttention.__call__   si    .T9"((+
 zz-0"23J::&67L -0J::m4L((6&&z2
((6 ;;'3'9'9!'<j>N>Nq>Q*  ,77!^^G4]C
%)^^G%<\%J%P%PQR%S"!//$$q!Z&;aLRd=e #..q!]l]KZK/OP**;HYHYZ[Z\H]8]^K %$++ --coonS[.\^i^o^opN*>;GN[[(N' __^(KN ;;D--g|DD
7;7Q7Q,84Jn
 % ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 &::&8,XLjj!8,U!))+*;*;BQ*?%*GH1B;- JUr4   )r}   rv   rz   ry   r{   NFTF)rc   rd   re   rf   r!   rh   rp   rj   r,   ri   r9   rU   r   r   rA   compactr   r   ndarrayr`   rk   rl   r4   r2   rn   rn      s    FD{{E399":q[ ZZ* *H 37 "'_
 #3;;/_ _  _ _r4   rn   c                   n    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S	S\
4S jjrSrg)
FlaxRobertaSelfOutputib  r8   r9   c                    [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [         R                  " U R                  R                  U R                  S9U l
        [         R                  " U R                  R                  S9U l        g )Nrs   r9   r=   r?   )rA   rx   r8   rD   rE   rF   rG   rH   r9   denserN   rO   rP   rQ   rR   rS   s    r2   rU   FlaxRobertaSelfOutput.setupf  s    XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr4   rW   c                 l    U R                  U5      nU R                  XS9nU R                  X-   5      nU$ NrY   r   rR   rN   )rT   r_   input_tensorrW   s       r2   r`   FlaxRobertaSelfOutput.__call__o  s7    

=1]P}'CDr4   rN   r   rR   Nrb   rc   rd   re   rf   r!   rh   r,   ri   r9   rU   rj   r`   rk   rl   r4   r2   r   r   b  s1    {{E399"H4  r4   r   c                       \ rS rSr% \\S'   Sr\\S'   \R                  r
\R                  \S'   S r    SS\4S	 jjrS
rg)FlaxRobertaAttentioniw  r8   Frp   r9   c                     [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        g )Nrp   r9   rt   )rn   r8   rp   r9   rT   r   outputrS   s    r2   rU   FlaxRobertaAttention.setup|  s7    ,T[[TXT^T^_	+DKKtzzJr4   Nr   c           
      ~    U R                  UUUUUUUS9nUS   n	U R                  XUS9nU4n
U(       a  XS   4-  n
U
$ )N)r   r   r   rW   r   r   rY   r    )rT   r   )rT   r_   r\   r   r   r   rW   r   attn_outputsr   r   s              r2   r`   FlaxRobertaAttention.__call__  sh     yy+-!'/ ! 
 #1oKm\ "Q))Gr4   )r   rT   r   )rc   rd   re   rf   r!   rh   rp   rj   r,   ri   r9   rU   r`   rk   rl   r4   r2   r   r   w  sL    FD{{E399"K "'   r4   r   c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxRobertaIntermediatei  r8   r9   c                 0   [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [        U R                  R                     U l        g Nr   )rA   rx   r8   intermediate_sizerE   rF   rG   rH   r9   r   r   
hidden_act
activationrS   s    r2   rU   FlaxRobertaIntermediate.setup  s`    XXKK))++224;;3P3PQ**


 !!7!78r4   c                 J    U R                  U5      nU R                  U5      nU$ N)r   r   r   s     r2   r`    FlaxRobertaIntermediate.__call__  s$    

=16r4   )r   r   Nrc   rd   re   rf   r!   rh   r,   ri   r9   rU   r`   rk   rl   r4   r2   r   r     s$    {{E399"9r4   r   c                   n    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S	S\
4S jjrSrg)
FlaxRobertaOutputi  r8   r9   c                    [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [         R                  " U R                  R                  S9U l        [         R                  " U R                  R                  U R                  S9U l        g )Nr   r?   r=   )rA   rx   r8   rD   rE   rF   rG   rH   r9   r   rP   rQ   rR   rN   rO   rS   s    r2   rU   FlaxRobertaOutput.setup  s    XXKK##++224;;3P3PQ**


 zzt{{'F'FGdkk.H.HPTPZPZ[r4   rW   c                 l    U R                  U5      nU R                  XS9nU R                  X-   5      nU$ r   r   )rT   r_   attention_outputrW   s       r2   r`   FlaxRobertaOutput.__call__  s7    

=1]P}'GHr4   r   Nrb   r   rl   r4   r2   r   r     s1    {{E399"\t  r4   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   S r	     SS\
\R                     S\
\R                     S\S	\S
\4
S jjrSrg)FlaxRobertaLayeri  r8   r9   c                    [        U R                  U R                  R                  U R                  S9U l        [        U R                  U R                  S9U l        [        U R                  U R                  S9U l        U R                  R                  (       a%  [        U R                  SU R                  S9U l
        g g )Nr   rt   F)r   r8   
is_decoderr9   	attentionr   intermediater   r   add_cross_attentioncrossattentionrS   s    r2   rU   FlaxRobertaLayer.setup  s    -dkk$++BXBX`d`j`jk3DKKtzzR'4::F;;**"6t{{5X\XbXb"cD +r4   Nencoder_hidden_statesencoder_attention_maskr   rW   r   c	           	          U R                  UUUUUUS9n	U	S   n
Ub  U R                  U
UUUUUS9nUS   n
U R                  U
5      nU R                  XUS9nU4nU(       a  XS   4-  nUb	  UWS   4-  nU$ )N)r   r   rW   r   r   )r\   r   r   rW   r   rY   r    r   r   r   r   )rT   r_   r\   r   r   r   r   rW   r   attention_outputsr   cross_attention_outputsr   s                r2   r`   FlaxRobertaLayer.__call__  s     !NN+!'/ + 
 -Q/ !,&*&9&9 5 /!6+"3 ': '#  7q9))*:;MS`a "!,..G$03A688r4   r   )NNFTF)rc   rd   re   rf   r!   rh   r,   ri   r9   rU   r   r   rj   r`   rk   rl   r4   r2   r   r     s    {{E399"d 8<8< ""'+
  (4+ !) 5+ + +  + +r4   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r       SS\\R                     S	\\R                     S
\
S\
S\
S\
S\
4S jjrSrg)FlaxRobertaLayerCollectioni  r8   r9   Fgradient_checkpointingc           	         U R                   (       ag  [        [        SS9n[        U R                  R
                  5       Vs/ s H(  nU" U R                  [        U5      U R                  S9PM*     snU l        g [        U R                  R
                  5       Vs/ s H+  n[        U R                  [        U5      U R                  S9PM-     snU l        g s  snf s  snf )N)         )static_argnums)namer9   )	r  rematr   ranger8   num_hidden_layersstrr9   layers)rT   FlaxRobertaCheckpointLayeris      r2   rU    FlaxRobertaLayerCollection.setup	  s    &&)./?PY)Z& t{{<<==A +4;;SV4::V=DK t{{<<==A !3q6L=DK
s   /C2CNr   r   r   rW   r   output_hidden_statesreturn_dictc                 4   U(       a  SOS nU	(       a  SOS nU(       a  Ub  SOS nUbX  UR                   S   [        U R                  5      :w  a2  [        S[        U R                  5       SUR                   S    S35      e[	        U R                  5       HL  u  pU	(       a  X4-  nU" UUUb  X>   OS UUUUU5      nUS   nU(       d  M5  UUS   4-  nUc  MC  UUS   4-  nMN     U	(       a  X4-  nXX4nU
(       d  [        S U 5       5      $ [        UUUUS	9$ )
Nrl   r   z&The head_mask should be specified for z/ layers, but it is for                         .r    r$   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rl   ).0vs     r2   	<genexpr>6FlaxRobertaLayerCollection.__call__.<locals>.<genexpr>L  s     =GqGs   	)last_hidden_stater_   
attentionscross_attentions)r+   r   r  rw   	enumerater   r   )rT   r_   r\   	head_maskr   r   r   rW   r   r  r  all_attentionsall_hidden_statesall_cross_attentionsr  layerlayer_outputsr   s                     r2   r`   #FlaxRobertaLayerCollection.__call__  sZ     1d"6BD&7<Q<]rdh  q!c$++&67 <S=M<N O'ooa014 
 "$++.HA#!%55!! ) 5	4%&!	M *!,M  =#3"55(4(]1-=,??(+ /.  !11 ^Z=G===<++%1	
 	
r4   )r  NNFTFFTrc   rd   re   rf   r!   rh   r,   ri   r9   r  rj   rU   r   r   r`   rk   rl   r4   r2   r  r    s    {{E399"#(D($ 8<8< ""'%* =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
 =
r4   r  c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r       SS\\R                     S	\\R                     S
\
S\
S\
S\
S\
4S jjrSrg)FlaxRobertaEncoderiW  r8   r9   Fr  c                 `    [        U R                  U R                  U R                  S9U l        g )Nr9   r  )r  r8   r9   r  r&  rS   s    r2   rU   FlaxRobertaEncoder.setup\  s%    /KK**#'#>#>

r4   Nr   r   r   rW   r   r  r  c                 2    U R                  UUUUUUUUU	U
S9
$ )N)r"  r   r   r   rW   r   r  r  r&  )rT   r_   r\   r"  r   r   r   rW   r   r  r  s              r2   r`   FlaxRobertaEncoder.__call__c  s8     zz"7#9!'/!5#  
 	
r4   r1  r)  r*  rl   r4   r2   r,  r,  W  s    {{E399"#(D(
 8<8< ""'%* 

  (4
 !) 5
 
 
  
 #
 
 
r4   r,  c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxRobertaPooleri  r8   r9   c                     [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        g r   )
rA   rx   r8   rD   rE   rF   rG   rH   r9   r   rS   s    r2   rU   FlaxRobertaPooler.setup  sH    XXKK##++224;;3P3PQ**

r4   c                 b    US S 2S4   nU R                  U5      n[        R                  " U5      $ )Nr   )r   rA   tanh)rT   r_   cls_hidden_states      r2   r`   FlaxRobertaPooler.__call__  s1    (A.::&67ww'((r4   )r   Nr   rl   r4   r2   r4  r4    s$    {{E399"
)r4   r4  c                       \ rS rSr% \\S'   \R                  r\R                  \S'   \	R                  R                  R                  r\S\R                   4   \S'   S rS
S jrS	rg)FlaxRobertaLMHeadi  r8   r9   .	bias_initc                    [         R                  " U R                  R                  U R                  [
        R                   R                  R                  U R                  R                  5      S9U l	        [         R                  " U R                  R                  U R                  S9U l        [         R                  " U R                  R                  U R                  S[
        R                   R                  R                  U R                  R                  5      S9U l        U R                  SU R                   U R                  R                  45      U l        g )Nrr   r=   F)r9   use_biasrs   r   )rA   rx   r8   rD   r9   rE   rF   rG   rH   r   rN   rO   
layer_normrC   decoderparamr=  r   rS   s    r2   rU   FlaxRobertaLMHead.setup  s    XXKK##**++224;;3P3PQ


 ,,t{{/I/IQUQ[Q[\xxKK""**++224;;3P3PQ	
 JJvt~~8N8N7PQ	r4   Nc                 D   U R                  U5      n[        S   " U5      nU R                  U5      nUb+  U R                  R	                  SSUR
                  00U5      nOU R                  U5      n[        R                  " U R                  U R                  5      nX-  nU$ )Ngeluparamskernel)
r   r   r@  rA  applyTr,   asarrayr   r9   )rT   r_   shared_embeddingr   s       r2   r`   FlaxRobertaLMHead.__call__  s    

=1v}56' LL..8EUEWEW:X/Y[hiM LL7M{{499djj1r4   )r   rA  r   r@  r   )rc   rd   re   rf   r!   rh   r,   ri   r9   rE   rA   rF   r   r=  r   npr   rU   r`   rk   rl   r4   r2   r<  r<    sL    {{E399"+.66+>+>+D+DIxRZZ(DRr4   r<  c                   f    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	SS jr
Srg)	FlaxRobertaClassificationHeadi  r8   r9   c                    [         R                  " U R                  R                  U R                  [
        R                   R                  R                  U R                  R                  5      S9U l	        U R                  R                  b  U R                  R                  OU R                  R                  n[         R                  " US9U l        [         R                  " U R                  R                  U R                  [
        R                   R                  R                  U R                  R                  5      S9U l        g )Nrr   r?   )rA   rx   r8   rD   r9   rE   rF   rG   rH   r   classifier_dropoutrQ   rP   rR   
num_labelsout_projrT   rQ  s     r2   rU   #FlaxRobertaClassificationHead.setup  s    XXKK##**++224;;3P3PQ

 {{--9 KK**00 	
 zz'9:KK""**++224;;3P3PQ
r4   c                     US S 2SS S 24   nU R                  XS9nU R                  U5      n[        R                  " U5      nU R                  XS9nU R	                  U5      nU$ )Nr   rY   )rR   r   rA   r8  rS  )rT   r_   rW   s      r2   r`   &FlaxRobertaClassificationHead.__call__  sb    %aAg.]P

=1.]Pm4r4   )r   rR   rS  Nrb   r   rl   r4   r2   rO  rO    s$    {{E399"
$r4   rO  c                     ^  \ rS rSr% Sr\rSrSr\	R                  \S'   SS\R                  SS	4S
\S\S\S\R                   S\S\4U 4S jjjrS rS S\R*                  R,                  S\S\S\4S jjrS r\" \R9                  S5      5                   S!S\\   S\R*                  R,                  S\S\\   S\\   S\\   S\\   4S jj5       rSr U =r!$ )"FlaxRobertaPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
robertaNmodule_class)r    r    r   TFr8   input_shapeseedr9   _do_initr  c           	      N   > U R                   " SXUS.UD6n[        T	U ]	  XX#XES9  g )Nr8   r9   r  )r\  r]  r9   r^  rl   )r[  super__init__)
rT   r8   r\  r]  r9   r^  r  kwargsmodule	__class__s
            r2   rb  #FlaxRobertaPreTrainedModel.__init__  s6     ""w&Vlwpvw[SXlr4   c                 X    U R                  U R                  U R                  SS9U l        g )NTr`  )r[  r8   r9   _modulerS   s    r2   enable_gradient_checkpointing8FlaxRobertaPreTrainedModel.enable_gradient_checkpointing  s*    ((;;**#' ) 
r4   rngrF  returnc                 |   [         R                  " USS9n[         R                  " U5      n[        X@R                  R
                  5      n[         R                  " U5      n[         R                  " U R                  R                  U R                  R                  45      n[        R                  R                  U5      u  pXS.nU R                  R                  (       aQ  [         R                  " X R                  R                  4-   5      nUnU R                  R                  UUUUUUUUSS9	nOU R                  R                  XXuXhSS9nUS   nUbf  [!        [#        U5      5      n[!        [#        U5      5      nU R$                   H  nUU   UU'   M     ['        5       U l        [)        [+        U5      5      $ U$ )Nr#   rt   )rF  rR   F)r  rF  )r,   r   	ones_liker3   r8   pad_token_idr|   r  ru   rE   randomsplitr   rD   rd  initr   r   _missing_keyssetr   r   )rT   rk  r\  rF  r.   rZ   r[   r\   r"  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r2   init_weights'FlaxRobertaPreTrainedModel.init_weights  s   IIk6	y19)[[E]E]^y1HHdkk;;T[[=\=\]^	"%**"2"23"7
$=;;**$'IIk[[=T=T<V.V$W!%3""&++"2"2%&! #3 
# #'++"2"2fk #3 # ,H5(-)@AM!(6"23F#11&3K&@{#  2!$D.011  r4   c           	         [         R                  " X4SS9n[         R                  " USS9n[         R                  " [         R                  " [         R
                  " U5      R                  S   5      UR                  5      nU R                  R                  [        R                  R                  S5      X4USSS9n[        US   5      $ )	a  
Args:
    batch_size (`int`):
        batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
    max_length (`int`):
        maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
        cache.
r#   rt   r%   r   FT)r  r   r   )r,   r|   rn  r   r   
atleast_2dr+   rd  rr  rE   rp  PRNGKeyr   )rT   r   r   r.   r\   r[   init_variabless          r2   r   %FlaxRobertaPreTrainedModel.init_cache  s     HHj5TB	y=''

3>>)3L3R3RSU3V(WYbYhYhi))JJq!9lX]jn * 
 w/00r4   zbatch_size, sequence_lengthr   trainr   r  r  past_key_valuesc                 .   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [        R
                  " U5      nUc  [        XR                   R                  5      nUc  [        R                  " U5      nUc@  [        R                  " U R                   R                  U R                   R                  45      n0 nU	b  XS'   SU=(       d    U R                  0nU R                   R                  (       a  U(       a	  UUS'   S/nOSnU R                  R                  U[        R                   " USS9[        R                   " USS9[        R                   " USS9[        R                   " USS9[        R                   " USS9UUU
(       + UUUUUS9nUb  U(       a  Uu  nn[#        US   5      US'   U$ Ub'  U(       d   Uu  nnUS S	 [#        US   5      4-   US	S  -   nU$ U R                  R                  U[        R                   " USS9[        R                   " USS9[        R                   " USS9[        R                   " USS9[        R                   " USS9U
(       + UUUUS
9nU$ )NrR   rF  r   Fr#   rt   )rZ   r[   r"  r   r   rW   r   r  r  rv  mutabler  r    )rZ   r[   r"  rW   r   r  r  rv  )r8   r   r  r  r,   
zeros_liker3   ro  rn  r|   r  ru   rF  r   rd  rH  r   r   )rT   r.   r\   rZ   r[   r"  r   r   rF  r   r  r   r  r  r  rv  inputsr  r   s                      r2   r`   #FlaxRobertaPreTrainedModel.__call__1  s|   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ^^I6N=iIaIabL! ]]95N$++"?"?A`A`!abI ")OF1dkk2;;** "1w")kk''		)40		.5"yytD YY|4@))IT:&;'="'i"3%9' ( G$ *{+2(-5og6N-O)* ,[+2(!"1+/'2J)K(MMPWXYXZP[["  kk''		)40		.5"yytD YY|4@))IT:"'i"3%9' ( G r4   )rs  rh  r   )NNNNNNNNFNNNN)"rc   rd   re   rf   rg   r!   config_classbase_model_prefixr[  rA   Modulerh   r,   ri   r   intr9   rj   rb  ri  rE   rp  r~  r   rz  r   r   ROBERTA_INPUTS_DOCSTRINGformatr   dictr`   rk   __classcell__)re  s   @r2   rY  rY    s}   
 !L!"L"))"
 $;;',mm m 	m
 yym m !%m m
(!

 2 2 (! (!PZ (!fp (!V1& ++C+J+JKh+ij "#!%*.,0/3&**.^ ^ ZZ''^ ^ $D>^ 'tn^ d^^ "$^ k^r4   rY  c                   D   \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   Sr\
\S'   S r          SS
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\
S\
S\
S\
S\
4S jjrSrg	)FlaxRobertaModulei  r8   r9   Tadd_pooling_layerFr  c                     [        U R                  U R                  S9U l        [	        U R                  U R                  U R
                  S9U l        [        U R                  U R                  S9U l        g )Nrt   r.  )	r6   r8   r9   
embeddingsr,  r  encoderr4  poolerrS   s    r2   rU   FlaxRobertaModule.setup  sS    /4::N)KK**#'#>#>

 (4::Fr4   NrZ   r[   r"  r   r   r   rW   r   r  r  c                    Uc  [         R                  " U5      nUcV  [         R                  " [         R                  " [         R                  " U5      R
                  S   5      UR
                  5      nU R                  XXBU	S9nU R                  UUUU	UUUU
UUS9
nUS   nU R                  (       a  U R                  U5      OS nU(       d  Uc	  U4USS  -   $ X4USS  -   $ [        UUUR                  UR                  UR                  S9$ )Nr%   rY   )r"  rW   r   r   r   r   r  r  r   r    )r  pooler_outputr_   r  r   )r,   r  r   r   r}  r+   r  r  r  r  r   r_   r  r   )rT   r.   r\   rZ   r[   r"  r   r   r   rW   r   r  r  r_   r   pooleds                   r2   r`   FlaxRobertaModule.__call__  s&     ! ^^I6N ++CJJs~~i7P7V7VWY7Z,[]f]l]lmL|S` ( 
 ,,'"7#9!/!5#  
  
/3/E/E]+4~%''!"+55!*WQR[88?+ !//))$55
 	
r4   )r  r  r  )
NNNNNFTFFT)rc   rd   re   rf   r!   rh   r,   ri   r9   r  rj   r  rU   r   r   r`   rk   rl   r4   r2   r  r    s    {{E399""t"#(D(G 15.2+/7;8< ""'%* 5
 !-	5

 s{{+5
 CKK(5
  (45
 !) 55
 5
 5
  5
 #5
 5
 5
r4   r  zaThe bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.c                       \ rS rSr\rSrg)FlaxRobertaModeli  rl   N)rc   rd   re   rf   r  r[  rk   rl   r4   r2   r  r    s	    
 %Lr4   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)FlaxRobertaForMaskedLMModulei  r8   r9   Fr  c                     [        U R                  SU R                  U R                  S9U l        [        U R                  U R                  S9U l        g NF)r8   r  r9   r  r8   r9   r  r8   r9   r  rZ  r<  lm_headrS   s    r2   rU   "FlaxRobertaForMaskedLMModule.setup  @    (;;#**#'#>#>	
 )4::Nr4   rW   r   r  r  c
                 6   U R                  UUUUUUUUU	S9	n
U
S   nU R                  R                  (       a#  U R                   R                  S   S   S   S   nOS nU R	                  XS9nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R                  S	9$ )
NrW   r   r  r  r   rF  r  rI   	embeddingrK  r    logitsr_   r  )rZ  r8   tie_word_embeddingsr   r  r   r_   r  )rT   r.   r\   rZ   r[   r"  rW   r   r  r  r   r_   rK  r  s                 r2   r`   %FlaxRobertaForMaskedLMModule.__call__  s     ,,'/!5#  

  
;;**#||55h?MN_`alm# mO9wqr{**!!//))
 	
r4   r  rZ  NTFFTrc   rd   re   rf   r!   rh   r,   ri   r9   r  rj   rU   r`   rk   rl   r4   r2   r  r    sk    {{E399"#(D(O  #"'%* )
 )
  )
 #)
 )
 )
r4   r  z5RoBERTa Model with a `language modeling` head on top.c                       \ rS rSr\rSrg)FlaxRobertaForMaskedLMi   rl   N)rc   rd   re   rf   r  r[  rk   rl   r4   r2   r  r     s    /Lr4   r  z<mask>)r0   c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)*FlaxRobertaForSequenceClassificationModulei.  r8   r9   Fr  c                     [        U R                  U R                  SU R                  S9U l        [        U R                  U R                  S9U l        g )NFr8   r9   r  r  r  )r  r8   r9   r  rZ  rO  
classifierrS   s    r2   rU   0FlaxRobertaForSequenceClassificationModule.setup3  sC    (;;**##'#>#>	
 8t{{RVR\R\]r4   rW   r   r  r  c
                     U R                  UUUUUUUUU	S9	n
U
S   nU R                  XS9nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R                  S9$ Nr  r   rY   r    r  )rZ  r  r   r_   r  )rT   r.   r\   rZ   r[   r"  rW   r   r  r  r   sequence_outputr  s                r2   r`   3FlaxRobertaForSequenceClassificationModule.__call__<  s     ,,'/!5#  

 "!*N9wqr{**+!//))
 	
r4   )r  rZ  Nr  r  rl   r4   r2   r  r  .  sk    {{E399"#(D(^  #"'%* #
 #
  #
 ##
 #
 #
r4   r  z
    Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       \ rS rSr\rSrg)$FlaxRobertaForSequenceClassificationib  rl   N)rc   rd   re   rf   r  r[  rk   rl   r4   r2   r  r  b  s	     >Lr4   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)"FlaxRobertaForMultipleChoiceModuleiv  r8   r9   Fr  c                    [        U R                  U R                  U R                  S9U l        [
        R                  " U R                  R                  S9U l        [
        R                  " SU R                  S9U l
        g )Nr`  r?   r    rt   )r  r8   r9   r  rZ  rA   rP   rQ   rR   rx   r  rS   s    r2   rU   (FlaxRobertaForMultipleChoiceModule.setup{  sW    (;;**#'#>#>

 zzt{{'F'FG((1DJJ7r4   rW   r   r  r  c
                 :   UR                   S   n
Ub  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nU R                  UUUUUUUUU	S9	nUS   nU R                  XS9nU R	                  U5      nUR                  SU
5      nU	(       d	  U4USS  -   $ [        UUR                  UR                  S9$ )Nr    r%   r  rY   r$   r  )r+   r*   rZ  rR   r  r   r_   r  )rT   r.   r\   rZ   r[   r"  rW   r   r  r  num_choicesr   pooled_outputr  reshaped_logitss                  r2   r`   +FlaxRobertaForMultipleChoiceModule.__call__  sF     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ,,'/!5#  

  
]P/ ..[9#%33,"!//))
 	
r4   r  rR   rZ  Nr  r  rl   r4   r2   r  r  v  sj    {{E399"#(D(8  #"'%* ,
 ,
  ,
 #,
 ,
 ,
r4   r  z
    Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       \ rS rSr\rSrg)FlaxRobertaForMultipleChoicei  rl   N)rc   rd   re   rf   r  r[  rk   rl   r4   r2   r  r    s	     6Lr4   r  z(batch_size, num_choices, sequence_lengthc            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)'FlaxRobertaForTokenClassificationModulei  r8   r9   Fr  c                    [        U R                  U R                  SU R                  S9U l        U R                  R
                  b  U R                  R
                  OU R                  R                  n[        R                  " US9U l	        [        R                  " U R                  R                  U R                  S9U l        g )NFr  r?   rt   )r  r8   r9   r  rZ  rQ  rQ   rA   rP   rR   rx   rR  r  rT  s     r2   rU   -FlaxRobertaForTokenClassificationModule.setup  s    (;;**##'#>#>	
 {{--9 KK**00 	
 zz'9:((4;;#9#9Lr4   rW   r   r  r  c
                     U R                  UUUUUUUUU	S9	n
U
S   nU R                  XS9nU R                  U5      nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R
                  S9$ r  )rZ  rR   r  r   r_   r  )rT   r.   r\   rZ   r[   r"  rW   r   r  r  r   r_   r  s                r2   r`   0FlaxRobertaForTokenClassificationModule.__call__  s     ,,'/!5#  

  
]P/9wqr{**(!//))
 	
r4   r  Nr  r  rl   r4   r2   r  r    sk    {{E399"#(D(M, #"'%* $
 $
  $
 #$
 $
 $
r4   r  z
    Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       \ rS rSr\rSrg)!FlaxRobertaForTokenClassificationi  rl   N)rc   rd   re   rf   r  r[  rk   rl   r4   r2   r  r    s	     ;Lr4   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)%FlaxRobertaForQuestionAnsweringModulei  r8   r9   Fr  c                     [        U R                  U R                  SU R                  S9U l        [
        R                  " U R                  R                  U R                  S9U l        g )NFr  rt   )	r  r8   r9   r  rZ  rA   rx   rR  
qa_outputsrS   s    r2   rU   +FlaxRobertaForQuestionAnsweringModule.setup  sJ    (;;**##'#>#>	
 ((4;;#9#9Lr4   rW   r   r  r  c
                 V   U R                  UUUUUUUUU	S9	n
U
S   nU R                  U5      n[        R                  " XR                  R
                  SS9u  pUR                  S5      nUR                  S5      nU	(       d	  X4U
SS  -   $ [        UUU
R                  U
R                  S9$ )Nr  r   r%   r&   r    )start_logits
end_logitsr_   r  )
rZ  r  r,   rq  r8   rR  squeezer   r_   r  )rT   r.   r\   rZ   r[   r"  rW   r   r  r  r   r_   r  r  r  s                  r2   r`   .FlaxRobertaForQuestionAnsweringModule.__call__'  s     ,,'/!5#  

  
/#&99V[[5K5KRT#U #++B/''+
 -;;/%!!//))	
 	
r4   )r  rZ  Nr  r  rl   r4   r2   r  r    sk    {{E399"#(D(M  #"'%* (
 (
  (
 #(
 (
 (
r4   r  z
    Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       \ rS rSr\rSrg)FlaxRobertaForQuestionAnsweringiR  rl   N)rc   rd   re   rf   r  r[  rk   rl   r4   r2   r  r  R  s	     9Lr4   r  c                      \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r         SS\\R                     S	\\R                     S
\\R                     S\\R                     S\
S\
S\
S\
S\
4S jjrSrg)FlaxRobertaForCausalLMModuleie  r8   r9   Fr  c                     [        U R                  SU R                  U R                  S9U l        [        U R                  U R                  S9U l        g r  r  rS   s    r2   rU   "FlaxRobertaForCausalLMModule.setupj  r  r4   NrZ   r"  r   r   r   rW   r   r  r  c                 R   U R                  UUUUUUUUU	U
UUS9nUS   nU R                  R                  (       a#  U R                   R                  S   S   S   S   nOS nU R	                  XS9nU(       d	  U4USS  -   $ [        UUR                  UR                  UR                  S	9$ )
N)r   r   r   rW   r   r  r  r   rF  r  rI   r  r  r    )r  r_   r  r   )	rZ  r8   r  r   r  r   r_   r  r   )rT   r.   r\   r[   rZ   r"  r   r   r   rW   r   r  r  r   r_   rK  r  s                    r2   r`   %FlaxRobertaForCausalLMModule.__call__s  s      ,,"7#9!'/!5#  
  
;;**#||55h?MN_`alm# mO9wqr{**4!//))$55	
 	
r4   r  )	NNNNFTFFTr*  rl   r4   r2   r  r  e  s    {{E399"#(D(O 15+/7;8< ""'%* 0

 !-0
 CKK(0
  (40
 !) 50
 0
 0
  0
 #0
 0
 0
r4   r  z
    Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   J    \ rS rSr\rSS\\R                     4S jjr	S r
Srg)FlaxRobertaForCausalLMi  Nr\   c                 6   UR                   u  pEU R                  XB5      n[        R                  " XB4SS9nUb*  UR	                  SS9S-
  n[
        R                  " XsS5      nO2[        R                  " [        R                  " USS9S S S 24   XE45      nUUUS.$ )Nr#   rt   r%   r&   r    )r   r   )r  r\   r[   )	r+   r   r,   r|   r-   r   r   r   r   )	rT   r.   r   r\   r   
seq_lengthr  extended_attention_maskr[   s	            r2   prepare_inputs_for_generation4FlaxRobertaForCausalLM.prepare_inputs_for_generation  s    !*
//*A #&((J+C4"P%)00b09A=L&)&>&>?Vhn&o#++CJJz,NtUVw,WZdYqrL  /5(
 	
r4   c                 L    UR                   US'   US   S S 2SS 24   S-   US'   U$ )Nr  r[   r%   r    )r  )rT   model_outputsmodel_kwargss      r2   update_inputs_for_generation3FlaxRobertaForCausalLM.update_inputs_for_generation  s8    *7*G*G&''3N'CArsF'Ka'O^$r4   rl   r   )rc   rd   re   rf   r  r[  r   rE   Arrayr  r  rk   rl   r4   r2   r  r    s'     0L
S[\_\e\eSf 
*r4   r  )r  r  r  r  r  r  r  rY  )Utypingr   r   
flax.linenlinenrA   rE   	jax.numpynumpyr,   rM  flax.core.frozen_dictr   r   r   r   r	   r
   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_robertar!   
get_loggerrc   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr  r3   ROBERTA_START_DOCSTRINGr  r  r6   rn   r   r   r   r   r   r  r,  r4  r<  rO  rY  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  __all__rl   r4   r2   <module>r     s   &  
   > > 6 6 > ; 
 
 
 w v Y Y 0 
		H	%/ !:0 .# N(BII (Xhryy hXBII *'299 'Vbii &		 *6ryy 6tO
 O
f$
 $
P)		 )" 		  FBII @}!4 }BD
		 D
N g%1 %	% -/BDbds t7
299 7
t QSjk07 0 l0 "	1
 1
h  >+E >> ( 	:
 :
z  6#= 66  ":"A"ABl"m  !	8
bii 8
v  ;(B ;; %	6
BII 6
r  9&@ 99 #$	>
299 >
B  7 < )		r4   