
    bCi                        S SK JrJr  S SKrS SKJr  S SKrS SKJ	r
  S SK	rS SKJrJrJr  S SKJrJr  S SKJr  S SKJr  S SKJrJr  S SKJr  S	S
KJrJrJrJrJrJ r J!r!J"r"J#r#J$r$  S	SK%J&r&J'r'J(r(J)r)J*r*  S	SK+J,r,J-r-J.r.J/r/  SSK0J1r1  \/Rd                  " \35      r4Sr5Sr6\Rn                  r7\Rp                  Rr                   " S S\,5      5       r:Sr;Sr< " S S\Rz                  5      r> " S S\Rz                  5      r? " S S\Rz                  5      r@ " S S\Rz                  5      rA " S S\Rz                  5      rB " S S \Rz                  5      rC " S! S"\Rz                  5      rD " S# S$\Rz                  5      rE " S% S&\Rz                  5      rF " S' S(\Rz                  5      rG " S) S*\Rz                  5      rH " S+ S,\Rz                  5      rI " S- S.\Rz                  5      rJ " S/ S0\Rz                  5      rK " S1 S2\Rz                  5      rL " S3 S4\'5      rM " S5 S6\Rz                  5      rN\-" S7\;5       " S8 S9\M5      5       rO\(" \O\5\\65         " S: S;\Rz                  5      rP\-" S<\;5       " S= S>\M5      5       rQS?rR\*" \Q\<R                  S@5      \R-   5        \)" \Q\:\6SA9   " SB SC\Rz                  5      rT\-" SD\;5       " SE SF\M5      5       rU\(" \U\5\\65         " SG SH\Rz                  5      rV\-" SI\;5       " SJ SK\M5      5       rWSLrX\*" \W\<R                  S@5      \X-   5        \)" \W\!\6SA9   " SM SN\Rz                  5      rY\-" SO\;5       " SP SQ\M5      5       rZ\(" \Z\5\#\65         " SR SS\Rz                  5      r[\-" ST\;5       " SU SV\M5      5       r\\*" \\\<R                  SW5      5        \(" \\\5\ \65         " SX SY\Rz                  5      r]\-" SZ\;5       " S[ S\\M5      5       r^\(" \^\5\$\65         " S] S^\Rz                  5      r_\-" S_\;5       " S` Sa\M5      5       r`\(" \`\5\"\65         " Sb Sc\Rz                  5      ra\-" Sd\;5       " Se Sf\M5      5       rb\(" \b\5\\65        / SgQrcg)h    )CallableOptionalN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )
-FlaxBaseModelOutputWithPastAndCrossAttentionsFlaxBaseModelOutputWithPooling0FlaxBaseModelOutputWithPoolingAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutputFlaxNextSentencePredictorOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )
BertConfigzgoogle-bert/bert-base-uncasedr$   c                       \ rS rSr% SrSr\R                  \S'   Sr	\R                  \S'   Sr
\\\R                        \S'   Sr\\\R                        \S'   Srg)	FlaxBertForPreTrainingOutput=   a  
Output type of [`BertForPreTraining`].

Args:
    prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nprediction_logitsseq_relationship_logitshidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r(   jnpndarray__annotations__r)   r*   r   tupler+   __static_attributes__r,       e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/bert/modeling_flax_bert.pyr&   r&   =   sW    , &*s{{)+/S[[/26M8E#++./6/3Js{{+,3r7   r&   a
  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`BertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].

a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   r    \ rS rSr% Sr\\S'   \R                  r	\R                  \S'   S r
S
S\4S jjrSrg	)FlaxBertEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9U R                  S9U l
        [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9U R                  S9U l        [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9U R                  S9U l        [         R                  " U R                  R                   U R                  S9U l        [         R"                  " U R                  R$                  S9U l        g )N)stddev)embedding_initr=   epsilonr=   rate)nnEmbedr<   
vocab_sizehidden_sizejaxinitializersnormalinitializer_ranger=   word_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutselfs    r8   setupFlaxBertEmbeddings.setup   sJ   !xxKK""KK##66..55T[[=Z=Z5[**	 
 $&88KK//KK##66..55T[[=Z=Z5[**	$
  &(XXKK''KK##66..55T[[=Z=Z5[**	&
" dkk.H.HPTPZPZ[zzt{{'F'FGr7   deterministicc                    U R                  UR                  S5      5      nU R                  UR                  S5      5      nU R                  UR                  S5      5      nXh-   U-   n	U R	                  U	5      n	U R                  XS9n	U	$ )Ni4r[   )rM   astyperO   rQ   rR   rV   )
rX   	input_idstoken_type_idsposition_idsattention_maskr[   inputs_embedsposition_embedsrQ   r*   s
             r8   __call__FlaxBertEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &=O }5]Pr7   )rR   rV   rO   rQ   rM   NT)r-   r.   r/   r0   r1   r$   r4   r2   float32r=   rY   boolrf   r6   r,   r7   r8   r:   r:      s5    Q{{E399"H,_c  r7   r:   c                       \ rS rSr% \\S'   Sr\\S'   \R                  r
\R                  \S'   S rS rS r\R                  S	 5       r    SS\\R$                     S\S\4S jjrSrg
)FlaxBertSelfAttention   r<   Fcausalr=   c                 ,   U R                   R                  U R                   R                  -  U l        U R                   R                  U R                   R                  -  S:w  a  [	        S5      e[
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        [
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        [
        R                  " U R                   R                  U R                  [        R
                  R                  R                  U R                   R                  5      S9U l        U R                  (       a9  [!        ["        R$                  " SU R                   R&                  4SS9SS9U l        g g )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r=   kernel_initr#   rj   r=   )r<   rH   num_attention_headshead_dim
ValueErrorrE   Denser=   rI   rJ   rK   rL   querykeyvaluern   r	   r2   onesrN   causal_maskrW   s    r8   rY   FlaxBertSelfAttention.setup   si   //4;;3R3RR;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ

 ;;/!T[[@@APX^ D r7   c                     UR                  UR                  S S U R                  R                  U R                  4-   5      $ N   )reshapeshaper<   rr   rs   rX   r*   s     r8   _split_heads"FlaxBertSelfAttention._split_heads   s;    $$]%8%8!%<@_@_aeanan?o%oppr7   c                 n    UR                  UR                  S S U R                  R                  4-   5      $ r}   )r   r   r<   rH   r   s     r8   _merge_heads"FlaxBertSelfAttention._merge_heads  s2    $$]%8%8!%<@W@W?Y%YZZr7   c                 (   U R                  SS5      nU R                  SS[        R                  UR                  UR
                  5      nU R                  SS[        R                  UR                  UR
                  5      nU R                  SSS 5      nU(       a  UR                  R                  Gt ppUR                  nS[        U	5      -  USS4-   n[        R                  " UR                  X5      n[        R                  " UR                  X.5      nXl        X'l        UR                  S   nUR                  U-   Ul        [        R                  " [        R                  " U
5      X-   :  [        U	5      SX4-   5      n[        UU5      nXU4$ )	a<  
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
cache
cached_keycached_valuecache_indexc                  H    [         R                  " S[         R                  S9$ )Nr   rq   )r2   arrayint32r,   r7   r8   <lambda>=FlaxBertSelfAttention._concatenate_to_cache.<locals>.<lambda>  s    CIIaWZW`W`Dar7   )r   r   r#   )has_variablevariabler2   zerosr   r=   rx   lenr   dynamic_update_slicebroadcast_toaranger5   r   )rX   rw   rx   rv   rc   is_initializedr   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r8   _concatenate_to_cache+FlaxBertSelfAttention._concatenate_to_cache  s_    **7LA]]7L#))SYYPSPYPYZ
}}WnciiV[VaVabmmG]<abAKAQAQAWAW>ZY#))IS_,	1a/@@G**:+;+;SJC,,\-?-?PE"!&(-A% + 1 14M MK''

:&)NNj!Q(A$NNH +8^DN>))r7   Nkey_value_states
init_cacheoutput_attentionsc                    US LnUR                   S   n	U R                  U5      n
U(       a#  U R                  U5      nU R                  U5      nO"U R                  U5      nU R                  U5      nU R	                  U
5      n
U R	                  U5      nU R	                  U5      nU R
                  (       a  U
R                   S   UR                   S   pU R                  SS5      (       a\  U R                  S   S   nU R                  S   S   R                   S   n[        R                  " U R                  SSUS4SSUU45      nOU R                  S S 2S S 2S U2S U24   n[        R                  " UU	4UR                   SS  -   5      nUbR  U R
                  (       aA  [        R                  " [        R                  " USS9WR                   5      n[        UU5      nO,U R
                  (       a  WnOUb  [        R                  " USS9nU R
                  (       a3  U R                  SS5      (       d  U(       a  U R                  XX5      u  pnUb  [        R                   " US:  [        R"                  " UR                   S5      R%                  U R&                  5      [        R"                  " UR                   [        R(                  " U R&                  5      R*                  5      R%                  U R&                  5      5      nOS nS nU(       d+  U R,                  R.                  S:  a  U R1                  S	5      n[3        U
UUUU R,                  R.                  S
UU R&                  S S9	nUb  [        R4                  " SUU5      n[        R4                  " SUU5      nUR7                  UR                   S S S-   5      nU(       a  UU4nU$ U4nU$ )Nr   r#   r   r   r   )axisg        rV   T)biasdropout_rngdropout_ratebroadcast_dropoutr[   r=   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdr~   ))r   rv   rw   rx   r   rn   r   	variablesr   dynamic_slicerz   r2   r   expand_dimsr   r   selectfullr_   r=   finfominr<   attention_probs_dropout_probmake_rngr   einsumr   )rX   r*   rc   layer_head_maskr   r   r[   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthrz   attention_biasr   attn_weightsattn_outputoutputss                          r8   rf   FlaxBertSelfAttention.__call__&  si    .T9"((+
 zz-0"23J::&67L -0J::m4L((6&&z2
((6 ;;'3'9'9!'<j>N>Nq>Q*  ,77!^^G4]C
%)^^G%<\%J%P%PQR%S"!//$$q!Z&;aLRd=e #..q!]l]KZK/OP**;HYHYZ[Z\H]8]^K %$++ --coonS[.\^i^o^opN*>;GN[[(N' __^(KN ;;D--g|DD
7;7Q7Q,84Jn
 % ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 &::&8,XLjj!8,U!))+*;*;BQ*?%*GH1B;- JUr7   )rz   rs   rw   rv   rx   NFTF)r-   r.   r/   r0   r$   r4   rn   rj   r2   ri   r=   rY   r   r   rE   compactr   r   r3   rf   r6   r,   r7   r8   rl   rl      s    FD{{E399":q[ ZZ* *H 37 "'_
 #3;;/_ _  _ _r7   rl   c                   n    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S	S\
4S jjrSrg)
FlaxBertSelfOutputi  r<   r=   c                    [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [         R                  " U R                  R                  U R                  S9U l
        [         R                  " U R                  R                  S9U l        g )Nrp   r=   rA   rC   )rE   ru   r<   rH   rI   rJ   rK   rL   r=   denserR   rS   rT   rU   rV   rW   s    r8   rY   FlaxBertSelfOutput.setup  s    XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr7   r[   c                 l    U R                  U5      nU R                  XS9nU R                  X-   5      nU$ Nr^   r   rV   rR   )rX   r*   input_tensorr[   s       r8   rf   FlaxBertSelfOutput.__call__  s7    

=1]P}'CDr7   rR   r   rV   Nrh   r-   r.   r/   r0   r$   r4   r2   ri   r=   rY   rj   rf   r6   r,   r7   r8   r   r     s1    {{E399"H4  r7   r   c                       \ rS rSr% \\S'   Sr\\S'   \R                  r
\R                  \S'   S r    SS\4S	 jjrS
rg)FlaxBertAttentioni  r<   Frn   r=   c                     [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        g )Nrn   r=   rq   )rl   r<   rn   r=   rX   r   outputrW   s    r8   rY   FlaxBertAttention.setup  s7    )$++dkkQUQ[Q[\	(DJJGr7   Nr   c           
      ~    U R                  UUUUUUUS9nUS   n	U R                  XUS9nU4n
U(       a  XS   4-  n
U
$ )N)r   r   r   r[   r   r   r^   r#   )rX   r   )rX   r*   rc   r   r   r   r[   r   attn_outputsr   r   s              r8   rf   FlaxBertAttention.__call__  sh     yy+-!'/ ! 
 #1oKm\ "Q))Gr7   )r   rX   r   )r-   r.   r/   r0   r$   r4   rn   rj   r2   ri   r=   rY   rf   r6   r,   r7   r8   r   r     sL    FD{{E399"H "'   r7   r   c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxBertIntermediatei  r<   r=   c                 0   [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [        U R                  R                     U l        g Nr   )rE   ru   r<   intermediate_sizerI   rJ   rK   rL   r=   r   r   
hidden_act
activationrW   s    r8   rY   FlaxBertIntermediate.setup  s`    XXKK))++224;;3P3PQ**


 !!7!78r7   c                 J    U R                  U5      nU R                  U5      nU$ N)r   r   r   s     r8   rf   FlaxBertIntermediate.__call__  s$    

=16r7   )r   r   Nr-   r.   r/   r0   r$   r4   r2   ri   r=   rY   rf   r6   r,   r7   r8   r   r     s$    {{E399"9r7   r   c                   n    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S	S\
4S jjrSrg)
FlaxBertOutputi  r<   r=   c                    [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [         R                  " U R                  R                  S9U l        [         R                  " U R                  R                  U R                  S9U l        g )Nr   rC   rA   )rE   ru   r<   rH   rI   rJ   rK   rL   r=   r   rT   rU   rV   rR   rS   rW   s    r8   rY   FlaxBertOutput.setup  s    XXKK##++224;;3P3PQ**


 zzt{{'F'FGdkk.H.HPTPZPZ[r7   r[   c                 l    U R                  U5      nU R                  XS9nU R                  X-   5      nU$ r   r   )rX   r*   attention_outputr[   s       r8   rf   FlaxBertOutput.__call__  s7    

=1]P}'GHr7   r   Nrh   r   r,   r7   r8   r   r     s1    {{E399"\t  r7   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   S r	     SS\
\R                     S\
\R                     S\S	\S
\4
S jjrSrg)FlaxBertLayeri  r<   r=   c                    [        U R                  U R                  R                  U R                  S9U l        [        U R                  U R                  S9U l        [        U R                  U R                  S9U l        U R                  R                  (       a%  [        U R                  SU R                  S9U l
        g g )Nr   rq   F)r   r<   
is_decoderr=   	attentionr   intermediater   r   add_cross_attentioncrossattentionrW   s    r8   rY   FlaxBertLayer.setup  s    *4;;t{{?U?U]a]g]gh0DJJO$T[[

C;;**"3DKKUYU_U_"`D +r7   Nencoder_hidden_statesencoder_attention_maskr   r[   r   c	           	          U R                  UUUUUUS9n	U	S   n
Ub  U R                  U
UUUUUS9nUS   n
U R                  U
5      nU R                  XUS9nU4nU(       a  XS   4-  nUb	  UWS   4-  nU$ )N)r   r   r[   r   r   )rc   r   r   r[   r   r^   r#   r   r   r   r   )rX   r*   rc   r   r   r   r   r[   r   attention_outputsr   cross_attention_outputsr   s                r8   rf   FlaxBertLayer.__call__  s     !NN+!'/ + 
 -Q/ !,&*&9&9 5 /!6+"3 ': '#  7q9))*:;MS`a "!,..G$03A688r7   r  )NNFTF)r-   r.   r/   r0   r$   r4   r2   ri   r=   rY   r   r3   rj   rf   r6   r,   r7   r8   r   r     s    {{E399"a 8<8< ""'+
  (4+ !) 5+ + +  + +r7   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r       SS\\R                     S	\\R                     S
\
S\
S\
S\
S\
4S jjrSrg)FlaxBertLayerCollectioni%  r<   r=   Fgradient_checkpointingc           	         U R                   (       ag  [        [        SS9n[        U R                  R
                  5       Vs/ s H(  nU" U R                  [        U5      U R                  S9PM*     snU l        g [        U R                  R
                  5       Vs/ s H+  n[        U R                  [        U5      U R                  S9PM-     snU l        g s  snf s  snf )N)         )static_argnums)namer=   )	r  rematr   ranger<   num_hidden_layersstrr=   layers)rX   FlaxBertCheckpointLayeris      r8   rY   FlaxBertLayerCollection.setup*  s    &&&+M)&T# t{{<<==A (#a&

S=DK TYY]YdYdYvYvSwSwadkkAdjjISwDK
s   /C2CNr   r   r   r[   r   output_hidden_statesreturn_dictc                 4   U(       a  SOS nU	(       a  SOS nU(       a  Ub  SOS nUbX  UR                   S   [        U R                  5      :w  a2  [        S[        U R                  5       SUR                   S    S35      e[	        U R                  5       HL  u  pU	(       a  X4-  nU" UUUb  X>   OS UUUUU5      nUS   nU(       d  M5  UUS   4-  nUc  MC  UUS   4-  nMN     U	(       a  X4-  nXX4nU
(       d  [        S U 5       5      $ [        UUUUS	9$ )
Nr,   r   z&The head_mask should be specified for z/ layers, but it is for                         .r#   r~   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r,   ).0vs     r8   	<genexpr>3FlaxBertLayerCollection.__call__.<locals>.<genexpr>l  s     =GqGs   	)last_hidden_stater*   r+   cross_attentions)r   r   r  rt   	enumerater5   r   )rX   r*   rc   	head_maskr   r   r   r[   r   r  r  all_attentionsall_hidden_statesall_cross_attentionsr  layerlayer_outputsr   s                     r8   rf    FlaxBertLayerCollection.__call__6  sZ     1d"6BD&7<Q<]rdh  q!c$++&67 <S=M<N O'ooa014 
 "$++.HA#!%55!! ) 5	4%&!	M *!,M  =#3"55(4(]1-=,??(+ /.  !11 ^Z=G===<++%1	
 	
r7   )r  NNFTFFTr-   r.   r/   r0   r$   r4   r2   ri   r=   r  rj   rY   r   r3   rf   r6   r,   r7   r8   r  r  %  s    {{E399"#(D(
" 8<8< ""'%* =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
 =
r7   r  c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r       SS\\R                     S	\\R                     S
\
S\
S\
S\
S\
4S jjrSrg)FlaxBertEncoderiv  r<   r=   Fr  c                 `    [        U R                  U R                  U R                  S9U l        g )Nr=   r  )r  r<   r=   r  r&  rW   s    r8   rY   FlaxBertEncoder.setup{  s%    ,KK**#'#>#>

r7   Nr   r   r   r[   r   r  r  c                 2    U R                  UUUUUUUUU	U
S9
$ )N)r"  r   r   r   r[   r   r  r  r&  )rX   r*   rc   r"  r   r   r   r[   r   r  r  s              r8   rf   FlaxBertEncoder.__call__  s8     zz"7#9!'/!5#  
 	
r7   r1  r)  r*  r,   r7   r8   r,  r,  v  s    {{E399"#(D(
 8<8< ""'%* 

  (4
 !) 5
 
 
  
 #
 
 
r7   r,  c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxBertPooleri  r<   r=   c                     [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        g r   )
rE   ru   r<   rH   rI   rJ   rK   rL   r=   r   rW   s    r8   rY   FlaxBertPooler.setup  sH    XXKK##++224;;3P3PQ**

r7   c                 b    US S 2S4   nU R                  U5      n[        R                  " U5      $ )Nr   )r   rE   tanh)rX   r*   cls_hidden_states      r8   rf   FlaxBertPooler.__call__  s1    (A.::&67ww'((r7   )r   Nr   r,   r7   r8   r4  r4    s$    {{E399"
)r7   r4  c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxBertPredictionHeadTransformi  r<   r=   c                 (   [         R                  " U R                  R                  U R                  S9U l        [        U R                  R                     U l        [         R                  " U R                  R                  U R                  S9U l	        g )Nrq   rA   )rE   ru   r<   rH   r=   r   r   r   r   rR   rS   rW   s    r8   rY   %FlaxBertPredictionHeadTransform.setup  s[    XXdkk55TZZH
 !7!78dkk.H.HPTPZPZ[r7   c                 h    U R                  U5      nU R                  U5      nU R                  U5      $ r   )r   r   rR   r   s     r8   rf   (FlaxBertPredictionHeadTransform.__call__  s-    

=16~~m,,r7   )rR   r   r   Nr   r,   r7   r8   r<  r<    s%    {{E399"\
-r7   r<  c                       \ rS rSr% \\S'   \R                  r\R                  \S'   \	R                  R                  R                  r\S\R                   4   \S'   S rS
S jrS	rg)FlaxBertLMPredictionHeadi  r<   r=   .	bias_initc                 *   [        U R                  U R                  S9U l        [        R
                  " U R                  R                  U R                  SS9U l        U R                  SU R                  U R                  R                  45      U l
        g )Nrq   F)r=   use_biasr   )r<  r<   r=   	transformrE   ru   rG   decoderparamrC  r   rW   s    r8   rY   FlaxBertLMPredictionHead.setup  s`    8DJJWxx 6 6djjSXYJJvt~~8N8N7PQ	r7   Nc                    U R                  U5      nUb+  U R                  R                  SSUR                  00U5      nOU R                  U5      n[        R
                  " U R                  U R                  5      nX-  nU$ )Nparamskernel)rF  rG  applyTr2   asarrayr   r=   )rX   r*   shared_embeddingr   s       r8   rf   !FlaxBertLMPredictionHead.__call__  sr    }5' LL..8EUEWEW:X/Y[hiM LL7M{{499djj1r7   )r   rG  rF  r   )r-   r.   r/   r0   r$   r4   r2   ri   r=   rI   rE   rJ   r   rC  r   npr3   rY   rf   r6   r,   r7   r8   rB  rB    sL    {{E399"+.66+>+>+D+DIxRZZ(DR

r7   rB  c                   f    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	SS jr
Srg)	FlaxBertOnlyMLMHeadi  r<   r=   c                 J    [        U R                  U R                  S9U l        g )Nrq   )rB  r<   r=   predictionsrW   s    r8   rY   FlaxBertOnlyMLMHead.setup  s    3DKKtzzRr7   Nc                 $    U R                  XS9nU$ NrP  rV  )rX   r*   rP  s      r8   rf   FlaxBertOnlyMLMHead.__call__  s    (((Zr7   r[  r   r   r,   r7   r8   rT  rT    s%    {{E399"Sr7   rT  c                   X    \ rS rSr% \R
                  r\R                  \S'   S rS r	Sr
g)FlaxBertOnlyNSPHeadi  r=   c                 L    [         R                  " SU R                  S9U l        g )Nr~   rq   )rE   ru   r=   seq_relationshiprW   s    r8   rY   FlaxBertOnlyNSPHead.setup  s     "$** =r7   c                 $    U R                  U5      $ r   r`  )rX   pooled_outputs     r8   rf   FlaxBertOnlyNSPHead.__call__  s    $$]33r7   rc  N)r-   r.   r/   r0   r2   ri   r=   r4   rY   rf   r6   r,   r7   r8   r^  r^    s    {{E399">4r7   r^  c                   f    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	SS jr
Srg)	FlaxBertPreTrainingHeadsi  r<   r=   c                     [        U R                  U R                  S9U l        [        R
                  " SU R                  S9U l        g )Nrq   r~   )rB  r<   r=   rV  rE   ru   r`  rW   s    r8   rY   FlaxBertPreTrainingHeads.setup  s0    3DKKtzzR "$** =r7   Nc                 H    U R                  XS9nU R                  U5      nXE4$ rY  rV  r`  )rX   r*   rd  rP  prediction_scoresseq_relationship_scores         r8   rf   !FlaxBertPreTrainingHeads.__call__  s0     ,,],^!%!6!6}!E 88r7   rk  r   r   r,   r7   r8   rg  rg    s$    {{E399">9r7   rg  c                     ^  \ rS rSr% Sr\rSrSr\	R                  \S'   SS\R                  SS	4S
\S\S\S\R                   S\S\4U 4S jjjrS rS S\R*                  R,                  S\S\S\4S jjrS r\" \R9                  S5      5                   S!S\\   S\R*                  R,                  S\S\\   S\\   S\\   S\\   4S jj5       rSr U =r!$ )"FlaxBertPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
bertNmodule_class)r#   r#   r   TFr<   input_shapeseedr=   _do_initr  c           	      P   > U R                   " SUUUS.UD6n[        T	U ]	  XX#XES9  g )Nr<   r=   r  )rs  rt  r=   ru  r,   )rr  super__init__)
rX   r<   rs  rt  r=   ru  r  kwargsmodule	__class__s
            r8   ry   FlaxBertPreTrainedModel.__init__  sD     "" 
#9
 	
 	[SXlr7   c                 X    U R                  U R                  U R                  SS9U l        g )NTrw  )rr  r<   r=   _modulerW   s    r8   enable_gradient_checkpointing5FlaxBertPreTrainedModel.enable_gradient_checkpointing  s*    ((;;**#' ) 
r7   rngrK  returnc                    [         R                  " USS9n[         R                  " U5      n[         R                  " [         R                  " [         R
                  " U5      R                  S   5      U5      n[         R                  " U5      n[         R                  " U R                  R                  U R                  R                  45      n[        R                  R                  U5      u  pXS.nU R                  R                  (       aQ  [         R                  " X R                  R                   4-   5      nUnU R"                  R%                  UUUUUUUUSS9	nOU R"                  R%                  XXuXhSS9nUS   nUbf  ['        [)        U5      5      n['        [)        U5      5      nU R*                   H  nUU   UU'   M     [-        5       U l        [/        [1        U5      5      $ U$ )Nr]   rq   r   )rK  rV   F)r  rK  )r2   r   
zeros_liker   r   
atleast_2dr   	ones_likery   r<   r  rr   rI   randomsplitr   rH   r{  initr   r   _missing_keyssetr   r   )rX   r  rs  rK  r`   ra   rb   rc   r"  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r8   init_weights$FlaxBertPreTrainedModel.init_weights  s   IIk6		2''

3>>)3L3R3RSU3V(WYdey1HHdkk;;T[[=\=\]^	"%**"2"23"7
$=;;**$'IIk[[=T=T<V.V$W!%3""&++"2"2%&! #3 
# #'++"2"2fk #3 # ,H5(-)@AM!(6"23F#11&3K&@{#  2!$D.011  r7   c           	         [         R                  " X4SS9n[         R                  " USS9n[         R                  " [         R                  " [         R
                  " U5      R                  S   5      UR                  5      nU R                  R                  [        R                  R                  S5      X4USSS9n[        US   5      $ )	a  
Args:
    batch_size (`int`):
        batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
    max_length (`int`):
        maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
        cache.
r]   rq   r   r   FT)r  r   r   )r2   ry   r  r   r   r  r   r{  r  rI   r  PRNGKeyr   )rX   r   r   r`   rc   rb   init_variabless          r8   r   "FlaxBertPreTrainedModel.init_cacheF  s     HHj5TB	y=''

3>>)3L3R3RSU3V(WYbYhYhi))JJq!9lX]jn * 
 w/00r7   batch_size, sequence_lengthr   trainr   r  r  past_key_valuesc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [        R
                  " U5      nUcV  [        R                  " [        R                  " [        R                  " U5      R                  S   5      UR                  5      nUc  [        R                  " U5      nUc@  [        R                  " U R                   R                  U R                   R                  45      n0 nU	b  XS'   SU=(       d    U R                  0nU R                   R                  (       a  U(       a	  UUS'   S/nOSnU R                   R#                  U[        R$                  " USS9[        R$                  " USS9[        R$                  " USS9[        R$                  " USS9[        R$                  " USS9UUU
(       + UUUUUS9nUb  U(       a  Uu  nn['        US   5      US	'   U$ Ub'  U(       d   Uu  nnUS S
 ['        US   5      4-   US
S  -   nU$ U R                   R#                  U[        R$                  " USS9[        R$                  " USS9[        R$                  " USS9[        R$                  " USS9[        R$                  " USS9U
(       + UUUUS9nU$ )Nr   rV   rK  r   Fr]   rq   )ra   rb   r"  r   r   r[   r   r  r  r  mutabler  r#   )ra   rb   r"  r[   r   r  r  r  )r<   r   r  r  r2   r  r   r   r  r   r  ry   r  rr   rK  r   r{  rM  r   r   )rX   r`   rc   ra   rb   r"  r   r   rK  r   r  r   r  r  r  r  inputsr  r   s                      r8   rf    FlaxBertPreTrainedModel.__call__Y  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ^^I6N++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N$++"?"?A`A`!abI ")OF1dkk2;;** "1w")kk''		)40		.5"yytD YY|4@))IT:&;'="'i"3%9' ( G$ *{+2(-5og6N-O)* ,[+2(!"1+/'2J)K(MMPWXYXZP[["  kk''		)40		.5"yytD YY|4@))IT:"'i"3%9' ( G r7   )r  r  r   )NNNNNNNNFNNNN)"r-   r.   r/   r0   r1   r$   config_classbase_model_prefixrr  rE   Moduler4   r2   ri   r5   intr=   rj   ry  r  rI   r  r  r   r  r   r!   BERT_INPUTS_DOCSTRINGformatr   dictrf   r6   __classcell__)r|  s   @r8   rp  rp    s}   
 L"L"))"
 $;;',mm m 	m
 yym m !%m m$
(!

 2 2 (! (!PZ (!fp (!V1& ++@+G+GHe+fg "#!%*.,0/3&**.^ ^ ZZ''^ ^ $D>^ 'tn^ d^^ "$^ h^r7   rp  c                   D   \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   Sr\
\S'   S r          SS
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\
S\
S\
S\
S\
4S jjrSrg	)FlaxBertModulei  r<   r=   Tadd_pooling_layerFr  c                     [        U R                  U R                  S9U l        [	        U R                  U R                  U R
                  S9U l        [        U R                  U R                  S9U l        g )Nrq   r.  )	r:   r<   r=   
embeddingsr,  r  encoderr4  poolerrW   s    r8   rY   FlaxBertModule.setup  sS    ,T[[

K&KK**#'#>#>

 %T[[

Cr7   Nra   rb   r"  r   r   r   r[   r   r  r  c                    Uc  [         R                  " U5      nUcV  [         R                  " [         R                  " [         R                  " U5      R
                  S   5      UR
                  5      nU R                  XXBU	S9nU R                  UUUU	UUUU
UUS9
nUS   nU R                  (       a  U R                  U5      OS nU(       d  Uc	  U4USS  -   $ X4USS  -   $ [        UUUR                  UR                  UR                  S9$ )Nr   r^   )r"  r[   r   r   r   r   r  r  r   r#   )r  pooler_outputr*   r+   r   )r2   r  r   r   r  r   r  r  r  r  r   r*   r+   r   )rX   r`   rc   ra   rb   r"  r   r   r   r[   r   r  r  r*   r   pooleds                   r8   rf   FlaxBertModule.__call__  s&     ! ^^I6N ++CJJs~~i7P7V7VWY7Z,[]f]l]lmL|S` ( 
 ,,'"7#9!/!5#  
  
/3/E/E]+4~%''!"+55!*WQR[88?+ !//))$55
 	
r7   )r  r  r  )
NNNNNFTFFT)r-   r.   r/   r0   r$   r4   r2   ri   r=   r  rj   r  rY   r   r3   rf   r6   r,   r7   r8   r  r    s    {{E399""t"#(D(D 15.2+/7;8< ""'%* 5
 !-	5

 s{{+5
 CKK(5
  (45
 !) 55
 5
 5
  5
 #5
 5
 5
r7   r  z^The bare Bert Model transformer outputting raw hidden-states without any specific head on top.c                       \ rS rSr\rSrg)FlaxBertModeli  r,   N)r-   r.   r/   r0   r  rr  r6   r,   r7   r8   r  r    s	    
 "Lr7   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)FlaxBertForPreTrainingModulei  r<   r=   Fr  c                     [        U R                  U R                  U R                  S9U l        [        U R                  U R                  S9U l        g )Nrw  r<   r=   )r  r<   r=   r  rq  rg  clsrW   s    r8   rY   "FlaxBertForPreTrainingModule.setup  s=    ";;**#'#>#>
	
 ,4;;djjQr7   r[   r   r  r  c
                 H   U R                  UUUUUUUUU	S9	n
U R                  R                  (       a#  U R                   R                  S   S   S   S   nOS nU
S   nU
S   nU R	                  XUS9u  pU	(       d	  X4U
S	S  -   $ [        UUU
R                  U
R                  S
9$ )Nr[   r   r  r  rK  r  rM   	embeddingr   r#   rZ  r~   )r(   r)   r*   r+   )rq  r<   tie_word_embeddingsr   r  r&   r*   r+   )rX   r`   rc   ra   rb   r"  r[   r   r  r  r   rP  r*   rd  rl  rm  s                   r8   rf   %FlaxBertForPreTrainingModule.__call__  s     ))'/!5#  

 ;;**#yy228<\JK\]^ij#

48HH;K 5= 5
1 %>LL+/$:!//))	
 	
r7   rq  r  NTFFTr-   r.   r/   r0   r$   r4   r2   ri   r=   r  rj   rY   rf   r6   r,   r7   r8   r  r    sk    {{E399"#(D(R #"'%* -
 -
  -
 #-
 -
 -
r7   r  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                       \ rS rSr\rSrg)FlaxBertForPreTrainingiJ  r,   N)r-   r.   r/   r0   r  rr  r6   r,   r7   r8   r  r  J  s	     0Lr7   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.seq_relationship_logits
    ```
r  )output_typer  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)FlaxBertForMaskedLMModuleiq  r<   r=   Fr  c                     [        U R                  SU R                  U R                  S9U l        [        U R                  U R                  S9U l        g NF)r<   r  r=   r  r  r  r<   r=   r  rq  rT  r  rW   s    r8   rY   FlaxBertForMaskedLMModule.setupv  @    ";;#**#'#>#>	
	 'dkkLr7   r[   r   r  r  c
                 6   U R                  UUUUUUUUU	S9	n
U
S   nU R                  R                  (       a#  U R                   R                  S   S   S   S   nOS nU R	                  XS9nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R                  S	9$ )
Nr  r   rK  r  rM   r  rZ  r#   logitsr*   r+   )rq  r<   r  r   r  r   r*   r+   )rX   r`   rc   ra   rb   r"  r[   r   r  r  r   r*   rP  r  s                 r8   rf   "FlaxBertForMaskedLMModule.__call__  s     ))'/!5#  

  
;;**#yy228<\JK\]^ij# -K9wqr{**!!//))
 	
r7   r  Nr  r  r,   r7   r8   r  r  q  sk    {{E399"#(D(M  #"'%* )
 )
  )
 #)
 )
 )
r7   r  z2Bert Model with a `language modeling` head on top.c                       \ rS rSr\rSrg)FlaxBertForMaskedLMi  r,   N)r-   r.   r/   r0   r  rr  r6   r,   r7   r8   r  r    s    ,Lr7   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)'FlaxBertForNextSentencePredictionModulei  r<   r=   Fr  c                     [        U R                  U R                  U R                  S9U l        [        U R                  S9U l        g )Nrw  rq   )r  r<   r=   r  rq  r^  r  rW   s    r8   rY   -FlaxBertForNextSentencePredictionModule.setup  s7    ";;**#'#>#>
	
 'TZZ8r7   r[   r   r  r  c
                     U	b  U	OU R                   R                  n	U R                  UUUUUUUUU	S9	n
U
S   nU R                  U5      nU	(       d	  U4U
SS  -   $ [	        UU
R
                  U
R                  S9$ )Nr  r#   r~   r  )r<   r  rq  r  r   r*   r+   )rX   r`   rc   ra   rb   r"  r[   r   r  r  r   rd  seq_relationship_scoress                r8   rf   0FlaxBertForNextSentencePredictionModule.__call__  s     &1%<k$++BYBY ))'/!5#  

  
"&((="9+-;;.*!//))
 	
r7   r  Nr  r  r,   r7   r8   r  r    sj    {{E399"#(D(9 #"'%* %
 %
  %
 #%
 %
 %
r7   r  zJBert Model with a `next sentence prediction (classification)` head on top.c                       \ rS rSr\rSrg)!FlaxBertForNextSentencePredictioni  r,   N)r-   r.   r/   r0   r  rr  r6   r,   r7   r8   r  r    s	    
 ;Lr7   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForNextSentencePrediction

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
    >>> encoding = tokenizer(prompt, next_sentence, return_tensors="jax")

    >>> outputs = model(**encoding)
    >>> logits = outputs.logits
    >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
    ```
c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)'FlaxBertForSequenceClassificationModulei  r<   r=   Fr  c                    [        U R                  U R                  U R                  S9U l        U R                  R
                  b  U R                  R
                  OU R                  R                  n[        R                  " US9U l	        [        R                  " U R                  R                  U R                  S9U l        g )Nrw  rC   rq   r  r<   r=   r  rq  classifier_dropoutrU   rE   rT   rV   ru   
num_labels
classifierrX   r  s     r8   rY   -FlaxBertForSequenceClassificationModule.setup  s    ";;**#'#>#>
	 {{--9 KK**00 	
 zz'9:((KK""**
r7   r[   r   r  r  c
                     U R                  UUUUUUUUU	S9	n
U
S   nU R                  XS9nU R                  U5      nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R
                  S9$ )Nr  r#   r^   r~   r  )rq  rV   r  r   r*   r+   )rX   r`   rc   ra   rb   r"  r[   r   r  r  r   rd  r  s                r8   rf   0FlaxBertForSequenceClassificationModule.__call__%  s     ))'/!5#  

  
]P/9wqr{**+!//))
 	
r7   rq  r  rV   Nr  r  r,   r7   r8   r  r    sj    {{E399"#(D(
0 #"'%* $
 $
  $
 #$
 $
 $
r7   r  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       \ rS rSr\rSrg)!FlaxBertForSequenceClassificationiL  r,   N)r-   r.   r/   r0   r  rr  r6   r,   r7   r8   r  r  L  s	     ;Lr7   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)FlaxBertForMultipleChoiceModulei_  r<   r=   Fr  c                    [        U R                  U R                  U R                  S9U l        [
        R                  " U R                  R                  S9U l        [
        R                  " SU R                  S9U l
        g )Nrw  rC   r#   rq   )r  r<   r=   r  rq  rE   rT   rU   rV   ru   r  rW   s    r8   rY   %FlaxBertForMultipleChoiceModule.setupd  sW    ";;**#'#>#>
	
 zzt{{'F'FG((1DJJ7r7   r[   r   r  r  c
                 :   UR                   S   n
Ub  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nU R                  UUUUUUUUU	S9	nUS   nU R                  XS9nU R	                  U5      nUR                  SU
5      nU	(       d	  U4USS  -   $ [        UUR                  UR                  S9$ )Nr#   r   r  r^   r~   r  )r   r   rq  rV   r  r   r*   r+   )rX   r`   rc   ra   rb   r"  r[   r   r  r  num_choicesr   rd  r  reshaped_logitss                  r8   rf   (FlaxBertForMultipleChoiceModule.__call__m  sF     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ))'/!5#  

  
]P/ ..[9#%33,"!//))
 	
r7   r  Nr  r  r,   r7   r8   r  r  _  sj    {{E399"#(D(8  #"'%* ,
 ,
  ,
 #,
 ,
 ,
r7   r  z
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       \ rS rSr\rSrg)FlaxBertForMultipleChoicei  r,   N)r-   r.   r/   r0   r  rr  r6   r,   r7   r8   r  r    s	     3Lr7   r  z(batch_size, num_choices, sequence_lengthc            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)$FlaxBertForTokenClassificationModulei  r<   r=   Fr  c                    [        U R                  U R                  SU R                  S9U l        U R                  R
                  b  U R                  R
                  OU R                  R                  n[        R                  " US9U l	        [        R                  " U R                  R                  U R                  S9U l        g )NFr<   r=   r  r  rC   rq   r  r  s     r8   rY   *FlaxBertForTokenClassificationModule.setup  s    ";;**##'#>#>	
	 {{--9 KK**00 	
 zz'9:((4;;#9#9Lr7   r[   r   r  r  c
                     U R                  UUUUUUUUU	S9	n
U
S   nU R                  XS9nU R                  U5      nU	(       d	  U4U
SS  -   $ [        UU
R                  U
R
                  S9$ )Nr  r   r^   r#   r  )rq  rV   r  r   r*   r+   )rX   r`   rc   ra   rb   r"  r[   r   r  r  r   r*   r  s                r8   rf   -FlaxBertForTokenClassificationModule.__call__  s     ))'/!5#  

  
]P/9wqr{**(!//))
 	
r7   r  Nr  r  r,   r7   r8   r  r    sk    {{E399"#(D(M, #"'%* $
 $
  $
 #$
 $
 $
r7   r  z
    Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       \ rS rSr\rSrg)FlaxBertForTokenClassificationi  r,   N)r-   r.   r/   r0   r  rr  r6   r,   r7   r8   r  r    s	     8Lr7   r  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r    SS\
S\
S	\
S
\
4S jjrSrg)"FlaxBertForQuestionAnsweringModulei  r<   r=   Fr  c                     [        U R                  U R                  SU R                  S9U l        [
        R                  " U R                  R                  U R                  S9U l        g )NFr  rq   )	r  r<   r=   r  rq  rE   ru   r  
qa_outputsrW   s    r8   rY   (FlaxBertForQuestionAnsweringModule.setup  sJ    ";;**##'#>#>	
	 ((4;;#9#9Lr7   r[   r   r  r  c
                 V   U R                  UUUUUUUUU	S9	n
U
S   nU R                  U5      n[        R                  " XR                  R
                  SS9u  pUR                  S5      nUR                  S5      nU	(       d	  X4U
SS  -   $ [        UUU
R                  U
R                  S9$ )Nr  r   r   r   r#   )start_logits
end_logitsr*   r+   )
rq  r   r2   r  r<   r  squeezer   r*   r+   )rX   r`   rc   ra   rb   r"  r[   r   r  r  r   r*   r  r  r  s                  r8   rf   +FlaxBertForQuestionAnsweringModule.__call__  s     ))'/!5#  

  
/#&99V[[5K5KRT#U #++B/''+
 -;;/%!!//))	
 	
r7   )rq  r   Nr  r  r,   r7   r8   r  r    sk    {{E399"#(D(M  #"'%* (
 (
  (
 #(
 (
 (
r7   r  z
    Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       \ rS rSr\rSrg)FlaxBertForQuestionAnsweringi3  r,   N)r-   r.   r/   r0   r  rr  r6   r,   r7   r8   r  r  3  s	     6Lr7   r  c                      \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r         SS\\R                     S	\\R                     S
\\R                     S\\R                     S\
S\
S\
S\
S\
4S jjrSrg)FlaxBertForCausalLMModuleiF  r<   r=   Fr  c                     [        U R                  SU R                  U R                  S9U l        [        U R                  U R                  S9U l        g r  r  rW   s    r8   rY   FlaxBertForCausalLMModule.setupK  r  r7   Nra   r"  r   r   r   r[   r   r  r  c                 R   U R                  UUUUUUUUU	U
UUS9nUS   nU R                  R                  (       a#  U R                   R                  S   S   S   S   nOS nU R	                  XS9nU(       d	  U4USS  -   $ [        UUR                  UR                  UR                  S	9$ )
N)r   r   r   r[   r   r  r  r   rK  r  rM   r  rZ  r#   )r  r*   r+   r   )	rq  r<   r  r   r  r   r*   r+   r   )rX   r`   rc   rb   ra   r"  r   r   r   r[   r   r  r  r   r*   rP  r  s                    r8   rf   "FlaxBertForCausalLMModule.__call__T  s      ))"7#9!'/!5#  
  
;;**#yy228<\JK\]^ij# -K9wqr{**4!//))$55	
 	
r7   r  )	NNNNFTFFTr*  r,   r7   r8   r
  r
  F  s    {{E399"#(D(M 15+/7;8< ""'%* 0

 !-0
 CKK(0
  (40
 !) 50
 0
 0
  0
 #0
 0
 0
r7   r
  z
    Bert Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   J    \ rS rSr\rSS\\R                     4S jjr	S r
Srg)FlaxBertForCausalLMi  Nrc   c                 6   UR                   u  pEU R                  XB5      n[        R                  " XB4SS9nUb*  UR	                  SS9S-
  n[
        R                  " XsS5      nO2[        R                  " [        R                  " USS9S S S 24   XE45      nUUUS.$ )Nr]   rq   r   r   r#   )r   r   )r  rc   rb   )	r   r   r2   ry   cumsumr   r   r   r   )	rX   r`   r   rc   r   
seq_lengthr  extended_attention_maskrb   s	            r8   prepare_inputs_for_generation1FlaxBertForCausalLM.prepare_inputs_for_generation  s    !*
//*A #&((J+C4"P%)00b09A=L&)&>&>?Vhn&o#++CJJz,NtUVw,WZdYqrL  /5(
 	
r7   c                 L    UR                   US'   US   S S 2SS 24   S-   US'   U$ )Nr  rb   r   r#   )r  )rX   model_outputsmodel_kwargss      r8   update_inputs_for_generation0FlaxBertForCausalLM.update_inputs_for_generation  s8    *7*G*G&''3N'CArsF'Ka'O^$r7   r,   r   )r-   r.   r/   r0   r
  rr  r   rI   Arrayr  r  r6   r,   r7   r8   r  r    s'     -L
S[\_\e\eSf 
*r7   r  )
r  r  r  r  r  r  r  r  r  rp  )dtypingr   r   flax
flax.linenlinenrE   rI   	jax.numpynumpyr2   rR  flax.core.frozen_dictr   r   r   r   r	   r
   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r    r!   r"   configuration_bertr$   
get_loggerr-   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr  struct	dataclassr&   BERT_START_DOCSTRINGr  r  r:   rl   r   r   r   r   r   r  r,  r4  r<  rB  rT  r^  rg  rp  r  r  r  r  #FLAX_BERT_FOR_PRETRAINING_DOCSTRINGr  r  r  r  r  &FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRINGr  r  r  r  r  r  r  r  r
  r  __all__r,   r7   r8   <module>r5     s    &   
   > > 6 6 > ;     g f * 
		H	%5  4; 4 4:. `$ N( (VhBII hV ('		 'T299 $RYY (6BII 6rN
bii N
b$
bii $
N)RYY )"-bii -ryy .	")) 	4")) 49ryy 9@1 @FD
RYY D
N d"+ "	" ],?A_ap q:
299 :
z  04 00' #&   !>?Bee !(DSb
7
		 7
t NPde-1 - f- 02EGY[j k2
bii 2
j T;(? ;	;* &, %  !>?Bhh !%3Rap
:
bii :
z  ;(? ;; % 	:
bii :
z  3 7 33 4;;<fg 24QSb
8
299 8
v  8%< 88 "$79RTc
6
 6
r  6#: 66  $	>
		 >
B  1 < )	r7   