
    cCi                        S SK JrJr  S SKrS SKJr  S SKrS SKJ	r
  S SK	rS SKJrJrJr  S SKJrJr  S SKJr  S SKJr  S SKJrJr  S SKJr  S	S
KJrJrJrJrJrJ r J!r!J"r"  S	SK#J$r$J%r%J&r&J'r'J(r(  S	SK)J*r*J+r+J,r,J-r-  SSK.J/r/  \-R`                  " \15      r2Sr3Sr4\Rj                  r5\Rl                  Rn                   " S S\*5      5       r8Sr9Sr: " S S\Rv                  5      r< " S S\Rv                  5      r= " S S\Rv                  5      r> " S S\Rv                  5      r? " S S\Rv                  5      r@ " S S \Rv                  5      rA " S! S"\Rv                  5      rB " S# S$\Rv                  5      rC " S% S&\Rv                  5      rD " S' S(\Rv                  5      rE " S) S*\Rv                  5      rF " S+ S,\%5      rG " S- S.\Rv                  5      rH\+" S/\95       " S0 S1\G5      5       rI\&" \I\3\\45         " S2 S3\Rv                  5      rJ " S4 S5\Rv                  5      rK\+" S6\95       " S7 S8\G5      5       rL\&" \L\3\\45         " S9 S:\Rv                  5      rM\+" S;\95       " S< S=\G5      5       rNS>rO\(" \N\:R                  S?5      \O-   5        \'" \N\8\4S@9   " SA SB\Rv                  5      rQ\+" SC\95       " SD SE\G5      5       rR\&" \R\3\"\45        SF rS " SG SH\Rv                  5      rT " SI SJ\Rv                  5      rU\+" SK\95       " SL SM\G5      5       rV\(" \V\:R                  SN5      5        \&" \V\3\\45         " SO SP\Rv                  5      rW\+" SQ\95       " SR SS\G5      5       rX\&" \X\3\ \45         " ST SU\Rv                  5      rY " SV SW\Rv                  5      rZ\+" SX\95       " SY SZ\G5      5       r[\&" \[\3\!\45         " S[ S\\Rv                  5      r\\+" S]\95       " S^ S_\G5      5       r]\&" \]\3\\45        / S`Qr^g)a    )CallableOptionalN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutput-FlaxBaseModelOutputWithPastAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )ElectraConfigz"google/electra-small-discriminatorr"   c                       \ rS rSr% SrSr\R                  \S'   Sr	\
\\R                        \S'   Sr\
\\R                        \S'   Srg)FlaxElectraForPreTrainingOutput;   a)  
Output type of [`ElectraForPreTraining`].

Args:
    logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlogitshidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r&   jnpndarray__annotations__r'   r   tupler(   __static_attributes__r)       k/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/electra/modeling_flax_electra.pyr$   r$   ;   sG    & FCKK26M8E#++./6/3Js{{+,3r4   r$   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   r    \ rS rSr% Sr\\S'   \R                  r	\R                  \S'   S r
S
S\4S jjrSrg	)FlaxElectraEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9S9U l	        [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9S9U l        [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9S9U l        [         R                  " U R                  R                  U R                   S9U l        [         R"                  " U R                  R$                  S9U l        g )N)stddev)embedding_initepsilonr:   rate)nnEmbedr9   
vocab_sizeembedding_sizejaxinitializersnormalinitializer_rangeword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr:   Dropouthidden_dropout_probdropoutselfs    r5   setupFlaxElectraEmbeddings.setup   s5   !xxKK""KK&&66..55T[[=Z=Z5[ 

 $&88KK//KK&&66..55T[[=Z=Z5[$
 
 &(XXKK''KK&&66..55T[[=Z=Z5[&
"
 dkk.H.HPTPZPZ[zzt{{'F'FGr4   deterministicc                    U R                  UR                  S5      5      nU R                  UR                  S5      5      nU R                  UR                  S5      5      nXh-   U-   n	U R	                  U	5      n	U R                  XS9n	U	$ )Ni4rX   )rJ   astyperL   rN   rO   rS   )
rU   	input_idstoken_type_idsposition_idsattention_maskrX   inputs_embedsposition_embedsrN   r'   s
             r5   __call__FlaxElectraEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &=O }5]Pr4   )rO   rS   rL   rN   rJ   NTr*   r+   r,   r-   r.   r"   r1   r/   float32r:   rV   boolrc   r3   r)   r4   r5   r7   r7      s5    Q{{E399"H(_c  r4   r7   c                       \ rS rSr% \\S'   Sr\\S'   \R                  r
\R                  \S'   S rS rS r\R                  S	 5       r    SS\\R$                     S\S\4S jjrSrg
)FlaxElectraSelfAttention   r9   Fcausalr:   c                 ,   U R                   R                  U R                   R                  -  U l        U R                   R                  U R                   R                  -  S:w  a  [	        S5      e[
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        [
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        [
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        U R                  (       a9  [!        ["        R$                  " SU R                   R&                  4SS9SS9U l        g g )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r:   kernel_initr!   rh   r:   )r9   hidden_sizenum_attention_headshead_dim
ValueErrorrB   Denser:   rF   rG   rH   rI   querykeyvaluerl   r	   r/   onesrK   causal_maskrT   s    r5   rV   FlaxElectraSelfAttention.setup   si   //4;;3R3RR;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ

 ;;/!T[[@@APX^ D r4   c                     UR                  UR                  S S U R                  R                  U R                  4-   5      $ N   )reshapeshaper9   rq   rr   rU   r'   s     r5   _split_heads%FlaxElectraSelfAttention._split_heads   s;    $$]%8%8!%<@_@_aeanan?o%oppr4   c                 n    UR                  UR                  S S U R                  R                  4-   5      $ r|   )r~   r   r9   rp   r   s     r5   _merge_heads%FlaxElectraSelfAttention._merge_heads   s2    $$]%8%8!%<@W@W?Y%YZZr4   c                 (   U R                  SS5      nU R                  SS[        R                  UR                  UR
                  5      nU R                  SS[        R                  UR                  UR
                  5      nU R                  SSS 5      nU(       a  UR                  R                  Gt ppUR                  nS[        U	5      -  USS4-   n[        R                  " UR                  X5      n[        R                  " UR                  X.5      nXl        X'l        UR                  S   nUR                  U-   Ul        [        R                  " [        R                  " U
5      X-   :  [        U	5      SX4-   5      n[        UU5      nXU4$ )	a<  
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
cache
cached_keycached_valuecache_indexc                  H    [         R                  " S[         R                  S9$ )Nr   ro   )r/   arrayint32r)   r4   r5   <lambda>@FlaxElectraSelfAttention._concatenate_to_cache.<locals>.<lambda>   s    CIIaWZW`W`Dar4   r   r   r!   )has_variablevariabler/   zerosr   r:   rw   lenr   dynamic_update_slicebroadcast_toaranger2   r   )rU   rv   rw   ru   r`   is_initializedr   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r5   _concatenate_to_cache.FlaxElectraSelfAttention._concatenate_to_cache   s_    **7LA]]7L#))SYYPSPYPYZ
}}WnciiV[VaVabmmG]<abAKAQAQAWAW>ZY#))IS_,	1a/@@G**:+;+;SJC,,\-?-?PE"!&(-A% + 1 14M MK''

:&)NNj!Q(A$NNH +8^DN>))r4   Nkey_value_states
init_cacheoutput_attentionsc                    US LnUR                   S   n	U R                  U5      n
U(       a#  U R                  U5      nU R                  U5      nO"U R                  U5      nU R                  U5      nU R	                  U
5      n
U R	                  U5      nU R	                  U5      nU R
                  (       a  U
R                   S   UR                   S   pU R                  SS5      (       a\  U R                  S   S   nU R                  S   S   R                   S   n[        R                  " U R                  SSUS4SSUU45      nOU R                  S S 2S S 2S U2S U24   n[        R                  " UU	4UR                   SS  -   5      nUbR  U R
                  (       aA  [        R                  " [        R                  " USS9WR                   5      n[        UU5      nO,U R
                  (       a  WnOUb  [        R                  " USS9nU R
                  (       a3  U R                  SS5      (       d  U(       a  U R                  XX5      u  pnUb  [        R                   " US:  [        R"                  " UR                   S5      R%                  U R&                  5      [        R"                  " UR                   [        R(                  " U R&                  5      R*                  5      R%                  U R&                  5      5      nOS nS nU(       d+  U R,                  R.                  S:  a  U R1                  S	5      n[3        U
UUUU R,                  R.                  S
UU R&                  S S9	nUb  [        R4                  " SUU5      n[        R4                  " SUU5      nUR7                  UR                   S S S-   5      nU(       a  UU4nU$ U4nU$ )Nr   r!   r   r   r   )axisg        rS   T)biasdropout_rngdropout_ratebroadcast_dropoutrX   r:   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdr}   ))r   ru   rv   rw   r   rl   r   	variablesr   dynamic_slicery   r/   r   expand_dimsr   r   selectfullr\   r:   finfominr9   attention_probs_dropout_probmake_rngr   einsumr~   )rU   r'   r`   layer_head_maskr   r   rX   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthry   attention_biasr   attn_weightsattn_outputoutputss                          r5   rc   !FlaxElectraSelfAttention.__call__  si    .T9"((+
 zz-0"23J::&67L -0J::m4L((6&&z2
((6 ;;'3'9'9!'<j>N>Nq>Q*  ,77!^^G4]C
%)^^G%<\%J%P%PQR%S"!//$$q!Z&;aLRd=e #..q!]l]KZK/OP**;HYHYZ[Z\H]8]^K %$++ --coonS[.\^i^o^opN*>;GN[[(N' __^(KN ;;D--g|DD
7;7Q7Q,84Jn
 % ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 &::&8,XLjj!8,U!))+*;*;BQ*?%*GH1B;- JUr4   )ry   rr   rv   ru   rw   NFTF)r*   r+   r,   r-   r"   r1   rl   rh   r/   rg   r:   rV   r   r   rB   compactr   r   r0   rc   r3   r)   r4   r5   rj   rj      s    FD{{E399":q[ ZZ* *H 37 "'_
 #3;;/_ _  _ _r4   rj   c                   n    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S	S\
4S jjrSrg)
FlaxElectraSelfOutputih  r9   r:   c                    [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [         R                  " U R                  R                  U R                  S9U l
        [         R                  " U R                  R                  S9U l        g )Nrn   r:   r>   r@   )rB   rt   r9   rp   rF   rG   rH   rI   r:   denserO   rP   rQ   rR   rS   rT   s    r5   rV   FlaxElectraSelfOutput.setupl  s    XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr4   rX   c                 l    U R                  U5      nU R                  XS9nU R                  X-   5      nU$ Nr[   r   rS   rO   )rU   r'   input_tensorrX   s       r5   rc   FlaxElectraSelfOutput.__call__u  s7    

=1]P}'CDr4   rO   r   rS   Nre   r*   r+   r,   r-   r"   r1   r/   rg   r:   rV   rh   rc   r3   r)   r4   r5   r   r   h  s1    {{E399"H4  r4   r   c                       \ rS rSr% \\S'   Sr\\S'   \R                  r
\R                  \S'   S r    SS\4S	 jjrS
rg)FlaxElectraAttentioni}  r9   Frl   r:   c                     [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        g )Nrl   r:   ro   )rj   r9   rl   r:   rU   r   outputrT   s    r5   rV   FlaxElectraAttention.setup  s7    ,T[[TXT^T^_	+DKKtzzJr4   Nr   c           
      ~    U R                  UUUUUUUS9nUS   n	U R                  XUS9nU4n
U(       a  XS   4-  n
U
$ )N)r   r   r   rX   r   r   r[   r!   )rU   r   )rU   r'   r`   r   r   r   rX   r   attn_outputsr   r   s              r5   rc   FlaxElectraAttention.__call__  sh     yy+-!'/ ! 
 #1oKm\ "Q))Gr4   )r   rU   r   )r*   r+   r,   r-   r"   r1   rl   rh   r/   rg   r:   rV   rc   r3   r)   r4   r5   r   r   }  sL    FD{{E399"K "'   r4   r   c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxElectraIntermediatei  r9   r:   c                 0   [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [        U R                  R                     U l        g )Nr   )rB   rt   r9   intermediate_sizerF   rG   rH   rI   r:   r   r   
hidden_act
activationrT   s    r5   rV   FlaxElectraIntermediate.setup  s`    XXKK))++224;;3P3PQ**


 !!7!78r4   c                 J    U R                  U5      nU R                  U5      nU$ N)r   r   r   s     r5   rc    FlaxElectraIntermediate.__call__  s$    

=16r4   )r   r   Nr*   r+   r,   r-   r"   r1   r/   rg   r:   rV   rc   r3   r)   r4   r5   r   r     s$    {{E399"9r4   r   c                   n    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S	S\
4S jjrSrg)
FlaxElectraOutputi  r9   r:   c                    [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [         R                  " U R                  R                  S9U l        [         R                  " U R                  R                  U R                  S9U l        g )Nr   r@   r>   )rB   rt   r9   rp   rF   rG   rH   rI   r:   r   rQ   rR   rS   rO   rP   rT   s    r5   rV   FlaxElectraOutput.setup  s    XXKK##++224;;3P3PQ**


 zzt{{'F'FGdkk.H.HPTPZPZ[r4   rX   c                 l    U R                  U5      nU R                  XS9nU R                  X-   5      nU$ r   r   )rU   r'   attention_outputrX   s       r5   rc   FlaxElectraOutput.__call__  s7    

=1]P}'GHr4   r   Nre   r   r)   r4   r5   r   r     s1    {{E399"\t  r4   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   S r	     SS\
\R                     S\
\R                     S\S	\S
\4
S jjrSrg)FlaxElectraLayeri  r9   r:   c                    [        U R                  U R                  R                  U R                  S9U l        [        U R                  U R                  S9U l        [        U R                  U R                  S9U l        U R                  R                  (       a%  [        U R                  SU R                  S9U l
        g g )Nr   ro   F)r   r9   
is_decoderr:   	attentionr   intermediater   r   add_cross_attentioncrossattentionrT   s    r5   rV   FlaxElectraLayer.setup  s    -dkk$++BXBX`d`j`jk3DKKtzzR'4::F;;**"6t{{5X\XbXb"cD +r4   Nencoder_hidden_statesencoder_attention_maskr   rX   r   c	           	          U R                  UUUUUUS9n	U	S   n
Ub  U R                  U
UUUUUS9nUS   n
U R                  U
5      nU R                  XUS9nU4nU(       a  XS   4-  nUb	  UWS   4-  nU$ )N)r   r   rX   r   r   )r`   r   r   rX   r   r[   r!   r   r   r   r   )rU   r'   r`   r   r   r   r   rX   r   attention_outputsr   cross_attention_outputsr   s                r5   rc   FlaxElectraLayer.__call__  s     !NN+!'/ + 
 -Q/ !,&*&9&9 5 /!6+"3 ': '#  7q9))*:;MS`a "!,..G$03A688r4   r   )NNFTF)r*   r+   r,   r-   r"   r1   r/   rg   r:   rV   r   r0   rh   rc   r3   r)   r4   r5   r   r     s    {{E399"d 8<8< ""'+
  (4+ !) 5+ + +  + +r4   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r       SS\\R                     S	\\R                     S
\
S\
S\
S\
S\
4S jjrSrg)FlaxElectraLayerCollectioni
  r9   r:   Fgradient_checkpointingc           	         U R                   (       ag  [        [        SS9n[        U R                  R
                  5       Vs/ s H(  nU" U R                  [        U5      U R                  S9PM*     snU l        g [        U R                  R
                  5       Vs/ s H+  n[        U R                  [        U5      U R                  S9PM-     snU l        g s  snf s  snf )N)         )static_argnums)namer:   )	r  rematr   ranger9   num_hidden_layersstrr:   layers)rU   FlaxElectraCheckpointLayeris      r5   rV    FlaxElectraLayerCollection.setup  s    &&)./?PY)Z& t{{<<==A +4;;SV4::V=DK t{{<<==A !3q6L=DK
s   /C2CNr   r   r   rX   r   output_hidden_statesreturn_dictc                 4   U(       a  SOS nU	(       a  SOS nU(       a  Ub  SOS nUbX  UR                   S   [        U R                  5      :w  a2  [        S[        U R                  5       SUR                   S    S35      e[	        U R                  5       HL  u  pU	(       a  X4-  nU" UUUb  X>   OS UUUUU5      nUS   nU(       d  M5  UUS   4-  nUc  MC  UUS   4-  nMN     U	(       a  X4-  nXX4nU
(       d  [        S U 5       5      $ [        UUUUS	9$ )
Nr)   r   z&The head_mask should be specified for z/ layers, but it is for                         .r!   r}   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r)   ).0vs     r5   	<genexpr>6FlaxElectraLayerCollection.__call__.<locals>.<genexpr>R  s     =GqGs   	)last_hidden_stater'   r(   cross_attentions)r   r   r  rs   	enumerater2   r   )rU   r'   r`   	head_maskr   r   r   rX   r   r  r  all_attentionsall_hidden_statesall_cross_attentionsr  layerlayer_outputsr   s                     r5   rc   #FlaxElectraLayerCollection.__call__  sZ     1d"6BD&7<Q<]rdh  q!c$++&67 <S=M<N O'ooa014 
 "$++.HA#!%55!! ) 5	4%&!	M *!,M  =#3"55(4(]1-=,??(+ /.  !11 ^Z=G===<++%1	
 	
r4   )r  NNFTFFTr*   r+   r,   r-   r"   r1   r/   rg   r:   r  rh   rV   r   r0   rc   r3   r)   r4   r5   r  r  
  s    {{E399"#(D($ 8<8< ""'%* =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
 =
r4   r  c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r       SS\\R                     S	\\R                     S
\
S\
S\
S\
S\
4S jjrSrg)FlaxElectraEncoderi]  r9   r:   Fr  c                 `    [        U R                  U R                  U R                  S9U l        g )Nr:   r  )r  r9   r:   r  r%  rT   s    r5   rV   FlaxElectraEncoder.setupb  s%    /KK**#'#>#>

r4   Nr   r   r   rX   r   r  r  c                 2    U R                  UUUUUUUUU	U
S9
$ )N)r!  r   r   r   rX   r   r  r  r%  )rU   r'   r`   r!  r   r   r   rX   r   r  r  s              r5   rc   FlaxElectraEncoder.__call__i  s8     zz"7#9!'/!5#  
 	
r4   r0  r(  r)  r)   r4   r5   r+  r+  ]  s    {{E399"#(D(
 8<8< ""'%* 

  (4
 !) 5
 
 
  
 #
 
 
r4   r+  c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxElectraGeneratorPredictionsi  r9   r:   c                     [         R                  " U R                  R                  U R                  S9U l        [         R
                  " U R                  R                  U R                  S9U l        g )Nr>   ro   )rB   rO   r9   rP   r:   rt   rE   r   rT   s    r5   rV   %FlaxElectraGeneratorPredictions.setup  sE    dkk.H.HPTPZPZ[XXdkk88

K
r4   c                     U R                  U5      n[        U R                  R                     " U5      nU R	                  U5      nU$ r   )r   r   r9   r   rO   r   s     r5   rc   (FlaxElectraGeneratorPredictions.__call__  s=    

=1t{{556}E}5r4   )rO   r   Nr   r)   r4   r5   r3  r3    s%    {{E399"Lr4   r3  c                   f    \ rS rSr% Sr\\S'   \R                  r	\R                  \S'   S r
S rSrg)	#FlaxElectraDiscriminatorPredictionsi  zEPrediction module for the discriminator, made up of two dense layers.r9   r:   c                     [         R                  " U R                  R                  U R                  S9U l        [         R                  " SU R                  S9U l        g )Nro   r!   )rB   rt   r9   rp   r:   r   dense_predictionrT   s    r5   rV   )FlaxElectraDiscriminatorPredictions.setup  s9    XXdkk55TZZH
 "$** =r4   c                     U R                  U5      n[        U R                  R                     " U5      nU R	                  U5      R                  S5      nU$ )Nr   )r   r   r9   r   r;  squeezer   s     r5   rc   ,FlaxElectraDiscriminatorPredictions.__call__  sJ    

=1t{{556}E--m<DDRHr4   )r   r;  N)r*   r+   r,   r-   r.   r"   r1   r/   rg   r:   rV   rc   r3   r)   r4   r5   r9  r9    s'    O{{E399">r4   r9  c                     ^  \ rS rSr% Sr\rSrSr\	R                  \S'   SS\R                  SS	4S
\S\S\S\R                   S\S\4U 4S jjjrS rS S\R*                  R,                  S\S\S\4S jjrS r\" \R9                  S5      5                   S!S\\   S\R*                  R,                  S\S\\   S\\   S\\   S\\   4S jj5       rSr U =r!$ )"FlaxElectraPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
electraNmodule_class)r!   r!   r   TFr9   input_shapeseedr:   _do_initr  c           	      N   > U R                   " SXUS.UD6n[        T	U ]	  XX#XES9  g )Nr9   r:   r  )rD  rE  r:   rF  r)   )rC  super__init__)
rU   r9   rD  rE  r:   rF  r  kwargsmodule	__class__s
            r5   rJ  #FlaxElectraPreTrainedModel.__init__  s6     ""w&Vlwpvw[SXlr4   c                 X    U R                  U R                  U R                  SS9U l        g )NTrH  )rC  r9   r:   _modulerT   s    r5   enable_gradient_checkpointing8FlaxElectraPreTrainedModel.enable_gradient_checkpointing  s*    ((;;**#' ) 
r4   rngparamsreturnc                    [         R                  " USS9n[         R                  " U5      n[         R                  " [         R                  " [         R
                  " U5      R                  S   5      U5      n[         R                  " U5      n[         R                  " U R                  R                  U R                  R                  45      n[        R                  R                  U5      u  pXS.nU R                  R                  (       aQ  [         R                  " X R                  R                   4-   5      nUnU R"                  R%                  UUUUUUUUSS9	nOU R"                  R%                  XXuXhSS9nUS   nUbf  ['        [)        U5      5      n['        [)        U5      5      nU R*                   H  nUU   UU'   M     [-        5       U l        [/        [1        U5      5      $ U$ )NrZ   ro   r   )rT  rS   F)r  rT  )r/   r   
zeros_liker   r   
atleast_2dr   	ones_likerx   r9   r  rq   rF   randomsplitr   rp   rL  initr   r   _missing_keyssetr   r   )rU   rS  rD  rT  r]   r^   r_   r`   r!  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r5   init_weights'FlaxElectraPreTrainedModel.init_weights  s   IIk6		2''

3>>)3L3R3RSU3V(WYdey1HHdkk;;T[[=\=\]^	"%**"2"23"7
$=;;**$'IIk[[=T=T<V.V$W!%3""&++"2"2%&! #3 
# #'++"2"2fk #3 # ,H5(-)@AM!(6"23F#11&3K&@{#  2!$D.011  r4   c           	         [         R                  " X4SS9n[         R                  " USS9n[         R                  " [         R                  " [         R
                  " U5      R                  S   5      UR                  5      nU R                  R                  [        R                  R                  S5      X4USSS9n[        US   5      $ )	a  
Args:
    batch_size (`int`):
        batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
    max_length (`int`):
        maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
        cache.
rZ   ro   r   r   FT)r  r   r   )r/   rx   rY  r   r   rX  r   rL  r\  rF   rZ  PRNGKeyr   )rU   r   r   r]   r`   r_   init_variabless          r5   r   %FlaxElectraPreTrainedModel.init_cache  s     HHj5TB	y=''

3>>)3L3R3RSU3V(WYbYhYhi))JJq!9lX]jn * 
 w/00r4   batch_size, sequence_lengthr   trainr   r  r  past_key_valuesc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [        R
                  " U5      nUcV  [        R                  " [        R                  " [        R                  " U5      R                  S   5      UR                  5      nUc  [        R
                  " U5      nUc@  [        R                  " U R                   R                  U R                   R                  45      n0 nU	b  XS'   SU=(       d    U R                  0nU R                   R                  (       a  U(       a	  UUS'   S/nOSnU R                  R!                  U[        R"                  " USS9[        R"                  " USS9[        R"                  " USS9[        R"                  " USS9[        R"                  " USS9UUU
(       + UUUUUS9nUb  U(       a  Uu  nn[%        US   5      US	'   U$ Ub'  U(       d   Uu  nnUS S
 [%        US   5      4-   US
S  -   nU$ U R                  R!                  U[        R"                  " USS9[        R"                  " USS9[        R"                  " USS9[        R"                  " USS9[        R"                  " USS9U
(       + UUUUS9nU$ )Nr   rS   rT  r   FrZ   ro   )r^   r_   r!  r   r   rX   r   r  r  r`  mutablerl  r!   )r^   r_   r!  rX   r   r  r  r`  )r9   r   r  r  r/   rY  r   r   rX  r   rx   r  rq   rT  r   rL  applyr   r   )rU   r]   r`   r^   r_   r!  r   r   rT  r   rk  r   r  r  rl  r`  inputsrn  r   s                      r5   rc   #FlaxElectraPreTrainedModel.__call__  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ]]95N++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N$++"?"?A`A`!abI ")OF1dkk2;;** "1w")kk''		)40		.5"yytD YY|4@))IT:&;'="'i"3%9' ( G$ *{+2(-5og6N-O)* ,[+2(!"1+/'2J)K(MMPWXYXZP[["  kk''		)40		.5"yytD YY|4@))IT:"'i"3%9' ( G r4   )r]  rP  r   )NNNNNNNNFNNNN)"r*   r+   r,   r-   r.   r"   config_classbase_model_prefixrC  rB   Moduler1   r/   rg   r2   intr:   rh   rJ  rQ  rF   rZ  rg  r   rd  r   r   ELECTRA_INPUTS_DOCSTRINGformatr   dictrc   r3   __classcell__)rM  s   @r5   rA  rA    s}   
 !L!"L"))"
 $;;',mm m 	m
 yym m !%m m
(!

 2 2 (! (!PZ (!fp (!V1& ++C+J+JKh+ij "#!%*.,0/3&**.^ ^ ZZ''^ ^ $D>^ 'tn^ d^^ "$^ k^r4   rA  c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r        SS\\R                     S	\\R                     S
\\R                     S\
S\
S\
S\
S\
4S jjrSrg)FlaxElectraModuleid  r9   r:   Fr  c                 r   [        U R                  U R                  S9U l        U R                  R                  U R                  R
                  :w  a8  [        R                  " U R                  R
                  U R                  S9U l        [        U R                  U R                  U R                  S9U l        g )Nro   r-  )r7   r9   r:   
embeddingsrE   rp   rB   rt   embeddings_projectr+  r  encoderrT   s    r5   rV   FlaxElectraModule.setupi  sv    /4::N;;%%)@)@@&(hht{{/F/Fdjj&YD#)KKtzz$B]B]
r4   Nr!  r   r   r   rX   r   r  r  c                     U R                  XXBU	S9n[        U S5      (       a  U R                  U5      nU R                  UUUU	UUUU
UUS9
$ )Nr[   r~  )r!  rX   r   r   r   r   r  r  )r}  hasattrr~  r  )rU   r]   r`   r^   r_   r!  r   r   r   rX   r   r  r  r}  s                 r5   rc   FlaxElectraModule.__call__q  ss     __|S` % 

 4-..00<J||'"7#9!/!5#  
 	
r4   )r}  r~  r  )NNNFTFFT)r*   r+   r,   r-   r"   r1   r/   rg   r:   r  rh   rV   r   npr0   rc   r3   r)   r4   r5   r{  r{  d  s    {{E399"#(D(
 +/7;8< ""'%*  
 BJJ' 
  (4 
 !) 5 
  
  
   
 # 
  
  
r4   r{  zaThe bare Electra Model transformer outputting raw hidden-states without any specific head on top.c                       \ rS rSr\rSrg)FlaxElectraModeli  r)   N)r*   r+   r,   r-   r{  rC  r3   r)   r4   r5   r  r    s	    
 %Lr4   r  c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
R                  R                  R                  r\S\R"                  4   \S'   S rS rS	rg)
FlaxElectraTiedDensei  rE   r:   N.	bias_initc                 ^    U R                  SU R                  U R                  45      U l        g )Nr   )paramr  rE   r   rT   s    r5   rV   FlaxElectraTiedDense.setup  s#    JJvt~~8K8K7MN	r4   c                 F   [         R                  " XR                  5      n[         R                  " X R                  5      n[        R                  " UUUR
                  S-
  4S4S4U R                  S9n[         R                  " U R                  U R                  5      nX4-   $ )Nr!   r   )r)   r)   )r   )r/   asarrayr:   r   dot_generalndimr   r   )rU   xkernelyr   s        r5   rc   FlaxElectraTiedDense.__call__  sy    KK::&VZZ0OOvvzmT"H-nn	
 {{499djj1xr4   )r   )r*   r+   r,   r-   ru  r1   r/   rg   r:   r   rF   rB   rG   r   r  r   r  r0   rV   rc   r3   r)   r4   r5   r  r    sQ    {{E399"I+.66+>+>+D+DIxRZZ(DO
r4   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r        SS\
S	\
S
\
S\
4S jjrSrg)FlaxElectraForMaskedLMModulei  r9   r:   Fr  c                    [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        U R                  R                  (       a.  [        U R                  R                  U R                  S9U l
        g [        R                  " U R                  R                  U R                  S9U l
        g NrH  r9   r:   ro   r{  r9   r:   r  rB  r3  generator_predictionstie_word_embeddingsr  rD   generator_lm_headrB   rt   rT   s    r5   rV   "FlaxElectraForMaskedLMModule.setup      (;;djjIdId
 &EDKK_c_i_i%j";;**%9$++:P:PX\XbXb%cD"%'XXdkk.D.DDJJ%WD"r4   NrX   r   r  r  c
                    U R                  UUUUUUUUU	S9	n
U
S   nU R                  U5      nU R                  R                  (       a>  U R                   R                  S   S   S   S   nU R                  XR                  5      nOU R                  U5      nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R                  S9$ )	NrX   r   r  r  r   rT  r}  rJ   	embeddingr!   r&   r'   r(   )
rB  r  r9   r  r   r  Tr   r'   r(   )rU   r]   r`   r^   r_   r!  rX   r   r  r  r   r'   prediction_scoresshared_embeddings                 r5   rc   %FlaxElectraForMaskedLMModule.__call__  s     ,,'/!5#  

  
 66}E;;**#||55h?MN_`alm $ 6 67HJ\J\ ] $ 6 67H I%''!"+55!$!//))
 	
r4   rB  r  r  NNNNTFFTr*   r+   r,   r-   r"   r1   r/   rg   r:   r  rh   rV   rc   r3   r)   r4   r5   r  r    sw    {{E399"#(D(X ""'%* '
 '
  '
 #'
 '
 '
r4   r  z5Electra Model with a `language modeling` head on top.c                       \ rS rSr\rSrg)FlaxElectraForMaskedLMi  r)   N)r*   r+   r,   r-   r  rC  r3   r)   r4   r5   r  r    s    /Lr4   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r        SS\
S	\
S
\
S\
4S jjrSrg)FlaxElectraForPreTrainingModulei  r9   r:   Fr  c                     [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        g NrH  r  )r{  r9   r:   r  rB  r9  discriminator_predictionsrT   s    r5   rV   %FlaxElectraForPreTrainingModule.setup  sC    (;;djjIdId
 *MTXT_T_gkgqgq)r&r4   NrX   r   r  r  c
                     U R                  UUUUUUUUU	S9	n
U
S   nU R                  U5      nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R                  S9$ )Nr  r   r!   r  )rB  r  r$   r'   r(   rU   r]   r`   r^   r_   r!  rX   r   r  r  r   r'   r&   s                r5   rc   (FlaxElectraForPreTrainingModule.__call__  s     ,,'/!5#  

  
//>9wqr{**.!//))
 	
r4   )r  rB  r  r  r)   r4   r5   r  r    sw    {{E399"#(D(s ""'%* #
 #
  #
 ##
 #
 #
r4   r  z
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    c                       \ rS rSr\rSrg)FlaxElectraForPreTrainingi'  r)   N)r*   r+   r,   r-   r  rC  r3   r)   r4   r5   r  r  '  s	     3Lr4   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxElectraForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
    >>> model = FlaxElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.logits
    ```
rj  )output_typerr  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r        SS\
S	\
S
\
S\
4S jjrSrg)'FlaxElectraForTokenClassificationModuleiN  r9   r:   Fr  c                    [        U R                  U R                  U R                  S9U l        U R                  R
                  b  U R                  R
                  OU R                  R                  n[        R                  " U5      U l	        [        R                  " U R                  R                  U R                  S9U l        g NrH  ro   )r{  r9   r:   r  rB  classifier_dropoutrR   rB   rQ   rS   rt   
num_labels
classifierrU   r  s     r5   rV   -FlaxElectraForTokenClassificationModule.setupS  s    (;;djjIdId

 {{--9 KK**00 	
 zz"45((4;;#9#9Lr4   NrX   r   r  r  c
                     U R                  UUUUUUUUU	S9	n
U
S   nU R                  XS9nU R                  U5      nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R
                  S9$ Nr  r   r[   r!   r  )rB  rS   r  r   r'   r(   r  s                r5   rc   0FlaxElectraForTokenClassificationModule.__call___  s     ,,'/!5#  

  
]P/9wqr{**(!//))
 	
r4   )r  rS   rB  r  r  r)   r4   r5   r  r  N  sw    {{E399"#(D(
M ""'%* $
 $
  $
 #$
 $
 $
r4   r  z
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    c                       \ rS rSr\rSrg)!FlaxElectraForTokenClassificationi  r)   N)r*   r+   r,   r-   r  rC  r3   r)   r4   r5   r  r    s	     ;Lr4   r  c                     U $ r   r)   )r  rK  s     r5   identityr    s    Hr4   c                   r    \ rS rSr% Sr\\S'   \R                  r	\R                  \S'   S r
S
S\4S jjrS	rg)FlaxElectraSequenceSummaryi  a  
Compute a single vector summary of a sequence hidden states.

Args:
    config ([`PretrainedConfig`]):
        The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
        config class of your model for the default values it uses):

        - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
        - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
          (otherwise to `config.hidden_size`).
        - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
          another string or `None` will add no activation.
        - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
        - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
r9   r:   c                    [         U l        [        U R                  S5      (       a  U R                  R                  (       a  [        U R                  S5      (       aL  U R                  R
                  (       a1  U R                  R                  S:  a  U R                  R                  nOU R                  R                  n[        R                  " XR                  S9U l        [        U R                  SS 5      nU(       a	  [        U   OS U l        [         U l        [        U R                  S5      (       aI  U R                  R                  S:  a/  [        R                   " U R                  R                  5      U l        [         U l        [        U R                  S5      (       aK  U R                  R$                  S:  a0  [        R                   " U R                  R$                  5      U l        g g g )	Nsummary_use_projsummary_proj_to_labelsr   ro   summary_activationc                     U $ r   r)   )r  s    r5   r   2FlaxElectraSequenceSummary.setup.<locals>.<lambda>  s    XYr4   summary_first_dropoutsummary_last_dropout)r  summaryr  r9   r  r  r  rp   rB   rt   r:   getattrr   r   first_dropoutr  rQ   last_dropoutr  )rU   num_classesactivation_strings      r5   rV    FlaxElectraSequenceSummary.setup  sF   4;; 2338T8T%=>>KK66KK**Q."kk44"kk5588KzzBDL#DKK1EtL7H&!23k%4;; 788T[[=^=^ab=b!#DKK,M,M!ND$4;; 677DKK<\<\_`<` "

4;;+K+K LD =a7r4   NrX   c                     USS2S4   nU R                  XCS9nU R                  U5      nU R                  U5      nU R                  XCS9nU$ )a  
Compute a single vector summary of a sequence hidden states.

Args:
    hidden_states (`jnp.ndarray` of shape `[batch_size, seq_len, hidden_size]`):
        The hidden states of the last layer.
    cls_index (`jnp.ndarray` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
        Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

Returns:
    `jnp.ndarray`: The summary of the sequence hidden states.
Nr   r[   )r  r  r   r  )rU   r'   	cls_indexrX   r   s        r5   rc   #FlaxElectraSequenceSummary.__call__  sY     q!t$##F#Hf%(""6"Gr4   )r   r  r  r  )NTrf   r)   r4   r5   r  r    s8    " {{E399"M0T  r4   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r        SS\
S	\
S
\
S\
4S jjrSrg)"FlaxElectraForMultipleChoiceModulei  r9   r:   Fr  c                     [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        [        R                  " SU R                  S9U l	        g )NrH  r  r!   ro   )
r{  r9   r:   r  rB  r  sequence_summaryrB   rt   r  rT   s    r5   rV   (FlaxElectraForMultipleChoiceModule.setup  sU    (;;djjIdId
 !;$++UYU_U_ `((1DJJ7r4   NrX   r   r  r  c
                 :   UR                   S   n
Ub  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nU R                  UUUUUUUUU	S9	nUS   nU R                  XS9nU R	                  U5      nUR                  SU
5      nU	(       d	  U4USS  -   $ [        UUR                  UR                  S9$ )Nr!   r   r  r   r[   r  )r   r~   rB  r  r  r   r'   r(   )rU   r]   r`   r^   r_   r!  rX   r   r  r  num_choicesr   r'   pooled_outputr&   reshaped_logitss                   r5   rc   +FlaxElectraForMultipleChoiceModule.__call__  sI     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ,,'/!5#  

  
--m-Y/ ..[9#%33,"!//))
 	
r4   )r  rB  r  r  r  r)   r4   r5   r  r    sv    {{E399"#(D(8 ""'%* +
 +
  +
 #+
 +
 +
r4   r  z
    ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       \ rS rSr\rSrg)FlaxElectraForMultipleChoicei  r)   N)r*   r+   r,   r-   r  rC  r3   r)   r4   r5   r  r    s	     6Lr4   r  z(batch_size, num_choices, sequence_lengthc            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r        SS\
S	\
S
\
S\
4S jjrSrg)%FlaxElectraForQuestionAnsweringModulei2  r9   r:   Fr  c                     [        U R                  U R                  U R                  S9U l        [
        R                  " U R                  R                  U R                  S9U l        g r  )	r{  r9   r:   r  rB  rB   rt   r  
qa_outputsrT   s    r5   rV   +FlaxElectraForQuestionAnsweringModule.setup7  sE    (;;djjIdId
 ((4;;#9#9Lr4   NrX   r   r  r  c
                 V   U R                  UUUUUUUUU	S9	n
U
S   nU R                  U5      n[        R                  " XR                  R
                  SS9u  pUR                  S5      nUR                  S5      nU	(       d	  X4U
SS  -   $ [        UUU
R                  U
R                  S9$ )Nr  r   r   r   r!   )start_logits
end_logitsr'   r(   )
rB  r  r/   r[  r9   r  r>  r   r'   r(   )rU   r]   r`   r^   r_   r!  rX   r   r  r  r   r'   r&   r  r  s                  r5   rc   .FlaxElectraForQuestionAnsweringModule.__call__=  s     ,,'/!5#  

  
/#&99V[[5K5KRT#U #++B/''+
 -;;/%!!//))	
 	
r4   )rB  r  r  r  r)   r4   r5   r  r  2  sw    {{E399"#(D(M ""'%* &
 &
  &
 #&
 &
 &
r4   r  z
    ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       \ rS rSr\rSrg)FlaxElectraForQuestionAnsweringif  r)   N)r*   r+   r,   r-   r  rC  r3   r)   r4   r5   r  r  f  s	     9Lr4   r  c                   r    \ rS rSr% Sr\\S'   \R                  r	\R                  \S'   S r
S
S\4S jjrSrg	)FlaxElectraClassificationHeadiy  z-Head for sentence-level classification tasks.r9   r:   c                    [         R                  " U R                  R                  U R                  S9U l        U R                  R                  b  U R                  R                  OU R                  R                  n[         R                  " U5      U l	        [         R                  " U R                  R                  U R                  S9U l        g )Nro   )rB   rt   r9   rp   r:   r   r  rR   rQ   rS   r  out_projr  s     r5   rV   #FlaxElectraClassificationHead.setup  s    XXdkk55TZZH
 {{--9 KK**00 	
 zz"45!7!7tzzJr4   rX   c                     US S 2SS S 24   nU R                  X2S9nU R                  U5      n[        S   " U5      nU R                  X2S9nU R                  U5      nU$ )Nr   r[   gelu)rS   r   r   r  )rU   r'   rX   r  s       r5   rc   &FlaxElectraClassificationHead.__call__  s`    !Q'"LLL8JJqM6N1LLL8MM!r4   )r   rS   r  Nre   rf   r)   r4   r5   r  r  y  s4    7{{E399"KT  r4   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r        SS\
S	\
S
\
S\
4S jjrSrg)*FlaxElectraForSequenceClassificationModulei  r9   r:   Fr  c                     [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        g r  )r{  r9   r:   r  rB  r  r  rT   s    r5   rV   0FlaxElectraForSequenceClassificationModule.setup  s>    (;;djjIdId
 8t{{RVR\R\]r4   NrX   r   r  r  c
                     U R                  UUUUUUUUU	S9	n
U
S   nU R                  XS9nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R                  S9$ r  )rB  r  r   r'   r(   r  s                r5   rc   3FlaxElectraForSequenceClassificationModule.__call__  s     ,,'/!5#  

  
L9wqr{**+!//))
 	
r4   )r  rB  r  r  r)   r4   r5   r  r    sw    {{E399"#(D(^ ""'%* "
 "
  "
 #"
 "
 "
r4   r  z
    Electra Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       \ rS rSr\rSrg)$FlaxElectraForSequenceClassificationi  r)   N)r*   r+   r,   r-   r  rC  r3   r)   r4   r5   r  r    s	     >Lr4   r  c                   V   \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r           SS\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\
S\
S\
S\
S\
4S jjrSrg)FlaxElectraForCausalLMModulei  r9   r:   Fr  c                    [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        U R                  R                  (       a.  [        U R                  R                  U R                  S9U l
        g [        R                  " U R                  R                  U R                  S9U l
        g r  r  rT   s    r5   rV   "FlaxElectraForCausalLMModule.setup  r  r4   Nr`   r^   r_   r!  r   r   r   rX   r   r  r  c                    U R                  UUUUUUUUU	U
UUS9nUS   nU R                  U5      nU R                  R                  (       a?  U R                   R                  S   S   S   S   nU R                  UUR                  5      nOU R                  U5      nU(       d	  U4USS  -   $ [        UUR                  UR                  UR                  S9$ )	N)r   r   r   rX   r   r  r  r   rT  r}  rJ   r  r!   )r&   r'   r(   r  )rB  r  r9   r  r   r  r  r   r'   r(   r  )rU   r]   r`   r^   r_   r!  r   r   r   rX   r   r  r  r   r'   r  r  s                    r5   rc   %FlaxElectraForCausalLMModule.__call__  s     ,,"7#9!'/!5#  
  
 66}E;;**#||55h?MN_`alm $ 6 67HJZJ\J\ ] $ 6 67H I%''!"+554$!//))$55	
 	
r4   r  )NNNNNNFTFFTr)  r)   r4   r5   r  r    s    {{E399"#(D(X 1504.2+/7;8< ""'%* .
 !-.
 !-	.

 s{{+.
 CKK(.
  (4.
 !) 5.
 .
 .
  .
 #.
 .
 .
r4   r  z
    Electra Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   J    \ rS rSr\rSS\\R                     4S jjr	S r
Srg)FlaxElectraForCausalLMi  Nr`   c                 6   UR                   u  pEU R                  XB5      n[        R                  " XB4SS9nUb*  UR	                  SS9S-
  n[
        R                  " XsS5      nO2[        R                  " [        R                  " USS9S S S 24   XE45      nUUUS.$ )NrZ   ro   r   r   r!   )r   r   )rl  r`   r_   )	r   r   r/   rx   cumsumr   r   r   r   )	rU   r]   r   r`   r   
seq_lengthrl  extended_attention_maskr_   s	            r5   prepare_inputs_for_generation4FlaxElectraForCausalLM.prepare_inputs_for_generation!  s    !*
//*A #&((J+C4"P%)00b09A=L&)&>&>?Vhn&o#++CJJz,NtUVw,WZdYqrL  /5(
 	
r4   c                 L    UR                   US'   US   S S 2SS 24   S-   US'   U$ )Nrl  r_   r   r!   )rl  )rU   model_outputsmodel_kwargss      r5   update_inputs_for_generation3FlaxElectraForCausalLM.update_inputs_for_generation6  s8    *7*G*G&''3N'CArsF'Ka'O^$r4   r)   r   )r*   r+   r,   r-   r  rC  r   rF   Arrayr  r  r3   r)   r4   r5   r  r    s'     0L
S[\_\e\eSf 
*r4   r  )	r  r  r  r  r  r  r  r  rA  )_typingr   r   flax
flax.linenlinenrB   rF   	jax.numpynumpyr/   r  flax.core.frozen_dictr   r   r   r   r	   r
   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r   r   r    configuration_electrar"   
get_loggerr*   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr  struct	dataclassr$   ELECTRA_START_DOCSTRINGrv  rt  r7   rj   r   r   r   r   r   r  r+  r3  r9  rA  r{  r  r  r  r  r  r  &FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRINGrw  r  r  r  r  r  r  r  r  r  r  r  r  r  __all__r)   r4   r5   <module>r/     s    &   
   > > 6 6 > ; 	 	 	  g f 0 
		H	%: ! 4k 4 42 ,$ N&BII &Thryy hXBII *'299 'Vbii &		 *6ryy 6tO
 O
f$
 $
Nbii ")) "}!4 }@-
		 -
` g%1 %	% -/BDWYh i299 ,6
299 6
r QSjk07 0 l0 35HJ\^m n.
bii .
b 
 3 : 33* &$ ##$ABEkk !+JYh
5
bii 5
p 
 ;(B ;; %	@ @F7
 7
t  6#= 66
  ":"A"ABl"m  !	1
BII 1
h  9&@ 99 #$	BII 4-
 -
`  >+E >> ( 	=
299 =
@  7 < )	
r4   