
    bCiK                     \   S SK JrJr  S SKrS SKJr  S SKrS SKJ	r
  S SK	rS SKJrJrJr  S SKJr  S SKJrJr  S SKJr  SSKJrJrJrJrJrJrJr  SS	KJrJ r J!r!J"r"J#r#  SS
K$J%r%J&r&J'r'J(r(  SSK)J*r*  \(RV                  " \,5      r-Sr.Sr/\R`                  Rb                   " S S\%5      5       r2Sr3Sr4 " S S\Rj                  5      r6 " S S\Rj                  5      r7 " S S\Rj                  5      r8 " S S\Rj                  5      r9 " S S\Rj                  5      r: " S S\Rj                  5      r; " S S \Rj                  5      r< " S! S"\Rj                  5      r= " S# S$\Rj                  5      r> " S% S&\ 5      r? " S' S(\Rj                  5      r@\&" S)\35       " S* S+\?5      5       rA\!" \A\.\\/5         " S, S-\Rj                  5      rB\&" S.\35       " S/ S0\?5      5       rCS1rD\#" \C\4R                  S25      \D-   5        \"" \C\2\/S39   " S4 S5\Rj                  5      rF\&" S6\35       " S7 S8\?5      5       rG\!" \G\.\\/S9S:9   " S; S<\Rj                  5      rH\&" S=\35       " S> S?\?5      5       rI\!" \I\.\\/5         " S@ SA\Rj                  5      rJ\&" SB\35       " SC SD\?5      5       rK\#" \K\4R                  SE5      5        \!" \K\.\\/5         " SF SG\Rj                  5      rL\&" SH\35       " SI SJ\?5      5       rM\!" \M\.\\/5         " SK SL\Rj                  5      rN\&" SM\35       " SN SO\?5      5       rO\!" \O\.\\/5        / SPQrPg)Q    )CallableOptionalN)
FrozenDictfreezeunfreeze)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxBaseModelOutputWithPoolingFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )AlbertConfigzalbert/albert-base-v2r   c                       \ rS rSr% SrSr\R                  \S'   Sr	\R                  \S'   Sr
\\\R                        \S'   Sr\\\R                        \S'   Srg)	FlaxAlbertForPreTrainingOutput6   a  
Output type of [`FlaxAlbertForPreTraining`].

Args:
    prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nprediction_logits
sop_logitshidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r"   jnpndarray__annotations__r#   r$   r   tupler%   __static_attributes__r&       i/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/albert/modeling_flax_albert.pyr    r    6   sV    , &*s{{)"J"26M8E#++./6/3Js{{+,3r1   r    a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   r    \ rS rSr% Sr\\S'   \R                  r	\R                  \S'   S r
S
S\4S jjrSrg	)FlaxAlbertEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9S9U l	        [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9S9U l        [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9S9U l        [         R                  " U R                  R                  U R                   S9U l        [         R"                  " U R                  R$                  S9U l        g )N)stddev)embedding_initepsilonr7   rate)nnEmbedr6   
vocab_sizeembedding_sizejaxinitializersnormalinitializer_rangeword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr7   Dropouthidden_dropout_probdropoutselfs    r2   setupFlaxAlbertEmbeddings.setup   s5   !xxKK""KK&&66..55T[[=Z=Z5[ 

 $&88KK//KK&&66..55T[[=Z=Z5[$
 
 &(XXKK''KK&&66..55T[[=Z=Z5[&
"
 dkk.H.HPTPZPZ[zzt{{'F'FGr1   deterministicc                    U R                  UR                  S5      5      nU R                  UR                  S5      5      nU R                  UR                  S5      5      nXW-   U-   nU R	                  U5      nU R                  XS9nU$ )Ni4rU   )rG   astyperI   rK   rL   rP   )	rR   	input_idstoken_type_idsposition_idsrU   inputs_embedsposition_embedsrK   r$   s	            r2   __call__FlaxAlbertEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &=O }5]Pr1   )rL   rP   rI   rK   rG   NT)r'   r(   r)   r*   r+   r   r.   r,   float32r7   rS   boolr_   r0   r&   r1   r2   r4   r4      s4    Q{{E399"H&t  r1   r4   c                   n    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S	S\
4S jjrSrg)
FlaxAlbertSelfAttention   r6   r7   c                    U R                   R                  U R                   R                  -  S:w  a  [        S5      e[        R
                  " U R                   R                  U R                  [        R                  R                  R                  U R                   R                  5      S9U l        [        R
                  " U R                   R                  U R                  [        R                  R                  R                  U R                   R                  5      S9U l        [        R
                  " U R                   R                  U R                  [        R                  R                  R                  U R                   R                  5      S9U l        [        R
                  " U R                   R                  [        R                  R                  R                  U R                   R                  5      U R                  S9U l        [        R                  " U R                   R                   U R                  S9U l        [        R"                  " U R                   R$                  S9U l        g )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r7   kernel_initrh   r7   r;   r=   )r6   hidden_sizenum_attention_heads
ValueErrorr?   Denser7   rC   rD   rE   rF   querykeyvaluedenserL   rM   rN   rO   rP   rQ   s    r2   rS   FlaxAlbertSelfAttention.setup   s   ;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ


 XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr1   output_attentionsc                 Z   U R                   R                  U R                   R                  -  nU R                  U5      R	                  UR
                  S S U R                   R                  U4-   5      nU R                  U5      R	                  UR
                  S S U R                   R                  U4-   5      nU R                  U5      R	                  UR
                  S S U R                   R                  U4-   5      nUb  [        R                  " USS9n[        R                  " US:  [        R                  " UR
                  S5      R                  U R                  5      [        R                  " UR
                  [        R                  " U R                  5      R                   5      R                  U R                  5      5      n	OS n	S n
U(       d+  U R                   R"                  S:  a  U R%                  S5      n
['        UUU	U
U R                   R"                  SUU R                  S S9	n[        R(                  " S	X5      nUR	                  UR
                  S S S
-   5      nU R+                  U5      nU R-                  XS9nU R/                  X-   5      nU(       a  X4nU$ U4nU$ )N   )axisr   g        rP   T)biasdropout_rngdropout_ratebroadcast_dropoutrU   r7   	precisionz...hqk,...khd->...qhd)rX   )r6   rj   rk   rn   reshapeshaperp   ro   r,   expand_dimsr   selectfullrY   r7   finfominattention_probs_dropout_probmake_rngr   einsumrq   rP   rL   )rR   r$   attention_maskrU   rs   head_dimquery_statesvalue_states
key_statesattention_biasr{   attn_weightsattn_outputprojected_attn_outputlayernormed_attn_outputoutputss                   r2   r_    FlaxAlbertSelfAttention.__call__   sP   ;;**dkk.M.MMzz-088#t{{'F'F&QQ
 zz-088#t{{'F'F&QQ
 XXm,44#t{{'F'F&QQ


 % __^(KN ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 jj!8,U!))+*;*;BQ*?%*GH $

; 7 $-B `"&..1F1V"W=N*9 VmTnr1   )rL   rq   rP   ro   rn   rp   NTFr'   r(   r)   r*   r   r.   r,   rb   r7   rS   rc   r_   r0   r&   r1   r2   re   re      s2    {{E399"H<0]a 0 0r1   re   c                   v    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	  S
S\
S\
4S jjrSrg	)FlaxAlbertLayeri  r6   r7   c                 (   [        U R                  U R                  S9U l        [        R
                  " U R                  R                  [        R                  R                  R                  U R                  R                  5      U R                  S9U l        [        U R                  R                     U l        [        R
                  " U R                  R                  [        R                  R                  R                  U R                  R                  5      U R                  S9U l        [        R"                  " U R                  R$                  U R                  S9U l        [        R(                  " U R                  R*                  S9U l        g )Nr7   ri   r;   r=   )re   r6   r7   	attentionr?   rm   intermediate_sizerC   rD   rE   rF   ffnr   
hidden_act
activationrj   
ffn_outputrL   rM   full_layer_layer_normrN   rO   rP   rQ   s    r2   rS   FlaxAlbertLayer.setup  s    0DJJO88KK))++224;;3P3PQ**

 !!7!78((KK##++224;;3P3PQ**

 &(\\$++:T:T\`\f\f%g"zzt{{'F'FGr1   rU   rs   c                     U R                  XX4S9nUS   nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  XsS9nU R                  Xv-   5      nU4nU(       a  XS   4-  nU$ )NrU   rs   r   rX   r   )r   r   r   r   rP   r   )	rR   r$   r   rU   rs   attention_outputsattention_outputr   r   s	            r2   r_   FlaxAlbertLayer.__call__)  s     !NN + 
 -Q/XX./
__Z0
__Z0
\\*\J
22:3PQ "!,..Gr1   )r   r   rP   r   r   r   Nr   r   r&   r1   r2   r   r     sF    {{E399"H( #"' 	
   r1   r   c                   |    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	   SS\
S\
S\
4S jjrS	rg
)FlaxAlbertLayerCollectioniA  r6   r7   c           	          [        U R                  R                  5       Vs/ s H+  n[        U R                  [	        U5      U R
                  S9PM-     snU l        g s  snf )N)namer7   )ranger6   inner_group_numr   strr7   layersrR   is     r2   rS   FlaxAlbertLayerCollection.setupE  sJ    QVW[WbWbWrWrQs
QsAODKKc!fDJJGQs
 
s   2ArU   rs   output_hidden_statesc                     SnSn[        U R                  5       H0  u  pU	" UUUUS9n
U
S   nU(       a  XzS   4-   nU(       d  M+  Xa4-   nM2     U4nU(       a  X4-   nU(       a  X4-   nU$ )Nr&   r   r   r   )	enumerater   )rR   r$   r   rU   rs   r   layer_hidden_stateslayer_attentionslayer_indexalbert_layerlayer_outputr   s               r2   r_   "FlaxAlbertLayerCollection.__call__J  s     !)24;;)?%K'+"3	L )OM #3A6H#H ##&9<L&L# *@ !" 66G 33Gr1   r   NTFFr   r&   r1   r2   r   r   A  sR    {{E399"
 #"'%* 	
   # r1   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\   \S'   S r   SS\S\S	\4S
 jjrSrg)FlaxAlbertLayerCollectionsil  r6   r7   Nr   c                 J    [        U R                  U R                  S9U l        g )Nr   )r   r6   r7   albert_layersrQ   s    r2   rS    FlaxAlbertLayerCollections.setupq  s    6t{{$**Ur1   rU   rs   r   c                 ,    U R                  UUUUUS9nU$ NrU   rs   r   r   )rR   r$   r   rU   rs   r   r   s          r2   r_   #FlaxAlbertLayerCollections.__call__t  s/     $$'/!5 % 
 r1   r   r   )r'   r(   r)   r*   r   r.   r,   rb   r7   r   r   r   rS   rc   r_   r0   r&   r1   r2   r   r   l  sa    {{E399"!%K#%V #"'%* 	
   # r1   r   c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxAlbertLayerGroupsi  r6   r7   c           
          [        U R                  R                  5       Vs/ s H5  n[        U R                  [	        U5      [	        U5      U R
                  S9PM7     snU l        g s  snf )N)r   r   r7   )r   r6   num_hidden_groupsr   r   r7   r   r   s     r2   rS   FlaxAlbertLayerGroups.setup  sV     4;;889
9 't{{QSQRV[_[e[ef9
 
s   <A'rU   rs   r   return_dictc           	         U(       a  SOS nU(       a  U4OS n[        U R                  R                  5       Hs  n	[        XR                  R                  U R                  R                  -  -  5      n
U R
                  U
   " UUUUUS9nUS   nU(       a  X{S   -   nU(       d  Mn  X4-   nMu     U(       d  [        S XU4 5       5      $ [        XUS9$ )Nr&   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fNr&   ).0vs     r2   	<genexpr>1FlaxAlbertLayerGroups.__call__.<locals>.<genexpr>  s     h$Vq$Vs   	)last_hidden_stater$   r%   )r   r6   num_hidden_layersintr   r   r/   r   )rR   r$   r   rU   rs   r   r   all_attentionsall_hidden_statesr   	group_idxlayer_group_outputs               r2   r_   FlaxAlbertLayerGroups.__call__  s      1d0D],$t{{445AA!>!>A^A^!^_`I!%Y!7+"3%9" /q1M !/R2H!H##$58H$H!! 6$ h]~$Vhhh"+Yg
 	
r1   r   NTFFTr   r&   r1   r2   r   r     s_    {{E399"
 #"'%* "
 	"

  "
 #"
 "
 "
r1   r   c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxAlbertEncoderi  r6   r7   c                 2   [         R                  " U R                  R                  [        R                   R
                  R                  U R                  R                  5      U R                  S9U l	        [        U R                  U R                  S9U l        g )Nri   r   )r?   rm   r6   rj   rC   rD   rE   rF   r7   embedding_hidden_mapping_inr   albert_layer_groupsrQ   s    r2   rS   FlaxAlbertEncoder.setup  sb    +-88KK##++224;;3P3PQ**,
(
 $9DJJ#W r1   rU   rs   r   r   c                 J    U R                  U5      nU R                  UUUUUS9$ r   )r   r   )rR   r$   r   rU   rs   r   r   s          r2   r_   FlaxAlbertEncoder.__call__  s;     88G'''/!5 ( 
 	
r1   )r   r   Nr   r   r&   r1   r2   r   r     s`    {{E399"X #"'%* 
 	

  
 #
 
 
r1   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   \	R                  R                  R                  r\S\R                   4   \S'   S rS
S jrS	rg)FlaxAlbertOnlyMLMHeadi  r6   r7   .	bias_initc                    [         R                  " U R                  R                  U R                  S9U l        [        U R                  R                     U l        [         R                  " U R                  R                  U R                  S9U l	        [         R                  " U R                  R                  U R                  SS9U l        U R                  SU R                  U R                  R                  45      U l        g )Nr   r;   F)r7   use_biasrz   )r?   rm   r6   rB   r7   rq   r   r   r   rL   rM   rA   decoderparamr   rz   rQ   s    r2   rS   FlaxAlbertOnlyMLMHead.setup  s    XXdkk88

K
 !7!78dkk.H.HPTPZPZ[xx 6 6djjSXYJJvt~~8N8N7PQ	r1   Nc                    U R                  U5      nU R                  U5      nU R                  U5      nUb+  U R                  R	                  SSUR
                  00U5      nOU R                  U5      nXR                  -  nU$ )Nparamskernel)rq   r   rL   r   applyTrz   )rR   r$   shared_embeddings      r2   r_   FlaxAlbertOnlyMLMHead.__call__  sz    

=16}5' LL..8EUEWEW:X/Y[hiM LL7M"r1   )rL   r   rz   r   rq   r   )r'   r(   r)   r*   r   r.   r,   rb   r7   rC   r?   rD   zerosr   r   npr-   rS   r_   r0   r&   r1   r2   r   r     sL    {{E399"+.66+>+>+D+DIxRZZ(DRr1   r   c                   f    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	SS jr
Srg)	FlaxAlbertSOPHeadi  r6   r7   c                     [         R                  " U R                  R                  5      U l        [         R
                  " SU R                  S9U l        g )Nru   r   )r?   rN   r6   classifier_dropout_probrP   rm   r7   
classifierrQ   s    r2   rS   FlaxAlbertSOPHead.setup  s2    zz$++"E"EF((1DJJ7r1   c                 F    U R                  XS9nU R                  U5      nU$ )NrX   )rP   r   )rR   pooled_outputrU   logitss       r2   r_   FlaxAlbertSOPHead.__call__  s%    ]P/r1   )r   rP   Nra   )r'   r(   r)   r*   r   r.   r,   rb   r7   rS   r_   r0   r&   r1   r2   r   r     s$    {{E399"8r1   r   c                     ^  \ rS rSr% Sr\rSrSr\	R                  \S'   SS\R                  S4S	\S
\S\S\R                   S\4
U 4S jjjrSS\R(                  R*                  S
\S\S\4S jjr\" \R5                  S5      5               SS\\   S\R(                  R*                  S\S\\   S\\   S\\   4S jj5       rSrU =r$ )FlaxAlbertPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
albertNmodule_class)r   r   r   Tr6   input_shapeseedr7   _do_initc           	      L   > U R                   " SXS.UD6n[        TU ]	  XX#XES9  g )Nr6   r7   )r  r  r7   r  r&   )r  super__init__)	rR   r6   r  r  r7   r  kwargsmodule	__class__s	           r2   r  "FlaxAlbertPreTrainedModel.__init__  s2     ""H&HH[SXlr1   rngr   returnc           	      v   [         R                  " USS9n[         R                  " U5      n[         R                  " [         R                  " [         R
                  " U5      R                  S   5      U5      n[         R                  " U5      n[        R                  R                  U5      u  pXS.n
U R                  R                  XXuUSS9S   nUbd  [        [        U5      5      n[        [        U5      5      nU R                   H	  nX   X<'   M     [!        5       U l        [#        [%        U5      5      $ U$ )NrW   r   r   )r   rP   F)r   r   )r,   r   
zeros_likebroadcast_toarange
atleast_2dr   	ones_likerC   randomsplitr  initr	   r   _missing_keyssetr   r
   )rR   r  r  r   rZ   r[   r\   r   
params_rngr{   rngsrandom_paramsmissing_keys                r2   init_weights&FlaxAlbertPreTrainedModel.init_weights  s
   IIk6		2''

3>>)3L3R3RSU3V(WYdey1"%**"2"23"7
$=((^\W\ ) 

 (-)@AM!(6"23F#11&3&@#  2!$D.011  r1   batch_size, sequence_lengthr{   trainrs   r   r   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Uc  [        R
                  " U5      nUcV  [        R                  " [        R                  " [        R                  " U5      R                  S   5      UR                  5      nUc  [        R                  " U5      n0 nUb  XkS'   U R                  R                  SU=(       d    U R                  0[        R                  " USS9[        R                  " USS9[        R                  " USS9[        R                  " USS9U(       + UU	U
US9
$ )Nr   rP   r   rW   r   )r  )r6   rs   r   r   r,   r  r  r  r  r   r  r  r   r   array)rR   rZ   r   r[   r\   r   r{   r%  rs   r   r   r  s               r2   r_   "FlaxAlbertPreTrainedModel.__call__*  sC    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ^^I6N++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N ")O{{  v,-IIit,IInD1IInD1IIl$/I  ! 
 	
r1   )r  r   )	NNNNNFNNN) r'   r(   r)   r*   r+   r   config_classbase_model_prefixr  r?   Moduler.   r,   rb   r/   r   r7   rc   r  rC   r  PRNGKeyr   r"  r   ALBERT_INPUTS_DOCSTRINGformatr   dictr_   r0   __classcell__)r  s   @r2   r  r    sA   
  L "L"))"
 $;;
m
m 
m 	
m
 yy
m 
m 
m!

 2 2 ! !PZ !fp !0 ++B+I+IJg+hi !%*.,0/3&*-
 -
 ZZ''-
 -
 $D>-
 'tn-
 d^-
 j-
r1   r  c                       \ rS rSr% \\S'   \R                  r\R                  \S'   Sr	\
\S'   S r      SS\\R                     S	\\R                     S
\
S\
S\
S\
4S jjrSrg)FlaxAlbertModulei[  r6   r7   Tadd_pooling_layerc                    [        U R                  U R                  S9U l        [	        U R                  U R                  S9U l        U R                  (       a  [        R                  " U R                  R                  [        R                  R                  R                  U R                  R                  5      U R                  SS9U l        [        R                  U l        g S U l        S U l        g )Nr   pooler)rh   r7   r   )r4   r6   r7   
embeddingsr   encoderr3  r?   rm   rj   rC   rD   rE   rF   r5  tanhpooler_activationrQ   s    r2   rS   FlaxAlbertModule.setup`  s    .t{{$**M(DJJG!!((''FF//66t{{7T7TUjj	DK &(WWD"DK%)D"r1   Nr[   r\   rU   rs   r   r   c	           	      *   Uc  [         R                  " U5      nUcV  [         R                  " [         R                  " [         R                  " U5      R
                  S   5      UR
                  5      nU R                  XXES9n	U R                  U	UUUUUS9n
U
S   n	U R                  (       a*  U R                  U	S S 2S4   5      nU R                  U5      nOS nU(       d  Uc	  U	4U
SS  -   $ X4U
SS  -   $ [        U	UU
R                  U
R                  S9$ )Nr   rX   rU   rs   r   r   r   r   )r   pooler_outputr$   r%   )r,   r  r  r  r  r   r6  r7  r3  r5  r9  r   r$   r%   )rR   rZ   r   r[   r\   rU   rs   r   r   r$   r   pooleds               r2   r_   FlaxAlbertModule.__call__o  s#    ! ^^I6N ++CJJs~~i7P7V7VWY7Z,[]f]l]lmL	<m,,'/!5#  
  
!![[q!t!45F++F3FF~%''!"+55!*WQR[88-+ !//))	
 	
r1   )r6  r7  r5  r9  )NNTFFT)r'   r(   r)   r*   r   r.   r,   rb   r7   r3  rc   rS   r   r   r-   r_   r0   r&   r1   r2   r2  r2  [  s    {{E399""t"*& 04-1""'%* /
 !,	/

 rzz*/
 /
  /
 #/
 /
 /
r1   r2  z`The bare Albert Model transformer outputting raw hidden-states without any specific head on top.c                       \ rS rSr\rSrg)FlaxAlbertModeli  r&   N)r'   r(   r)   r*   r2  r  r0   r&   r1   r2   rA  rA    s	    
 $Lr1   rA  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxAlbertForPreTrainingModulei  r6   r7   c                     [        U R                  U R                  S9U l        [	        U R                  U R                  S9U l        [        U R                  U R                  S9U l        g )Nr
  )r2  r6   r7   r  r   predictionsr   sop_classifierrQ   s    r2   rS   $FlaxAlbertForPreTrainingModule.setup  sF    &dkkL04::V/t{{$**Ur1   rU   rs   r   r   c	                 ^   U R                  UUUUUUUUS9n	U R                  R                  (       a#  U R                   R                  S   S   S   S   n
OS n
U	S   nU	S   nU R	                  XS9nU R                  XS	9nU(       d	  X4U	S
S  -   $ [        UUU	R                  U	R                  S9$ )Nr<  r   r6  rG   	embeddingr   r   r   rX   ru   )r"   r#   r$   r%   )	r  r6   tie_word_embeddings	variablesrE  rF  r    r$   r%   )rR   rZ   r   r[   r\   rU   rs   r   r   r   r   r$   r   prediction_scores
sop_scoress                  r2   r_   'FlaxAlbertForPreTrainingModule.__call__  s     ++'/!5#  	
 ;;**#{{44X>|LM^_`kl#

 ,,],^(((T
%2WQR[@@-/!!//))	
 	
r1   )r  rE  rF  Nr   r   r&   r1   r2   rC  rC    s`    {{E399"V #"'%* *
 *
  *
 #*
 *
 *
r1   rC  z
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    c                       \ rS rSr\rSrg)FlaxAlbertForPreTrainingi  r&   N)r'   r(   r)   r*   rC  r  r0   r&   r1   r2   rQ  rQ    s	     2Lr1   rQ  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
    >>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.sop_logits
    ```
r$  )output_typer)  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxAlbertForMaskedLMModulei	  r6   r7   c                     [        U R                  SU R                  S9U l        [	        U R                  U R                  S9U l        g )NF)r6   r3  r7   r
  )r2  r6   r7   r  r   rE  rQ   s    r2   rS   !FlaxAlbertForMaskedLMModule.setup  s4    &dkkUZ^ZdZde04::Vr1   rU   rs   r   r   c	                 4   U R                  UUUUUUUUS9n	U	S   n
U R                  R                  (       a#  U R                   R                  S   S   S   S   nOS nU R	                  XS9nU(       d	  U4U	SS  -   $ [        UU	R                  U	R                  S	9$ )
Nr<  r   r   r6  rG   rI  rJ  r   r   r$   r%   )r  r6   rK  rL  rE  r   r$   r%   )rR   rZ   r   r[   r\   rU   rs   r   r   r   r$   r   r   s                r2   r_   $FlaxAlbertForMaskedLMModule.__call__  s     ++'/!5#  	
  
;;**#{{44X>|LM^_`kl# !!-!S9wqr{**!!//))
 	
r1   )r  rE  Nr   r   r&   r1   r2   rT  rT  	  s`    {{E399"W #"'%* '
 '
  '
 #'
 '
 '
r1   rT  z4Albert Model with a `language modeling` head on top.c                       \ rS rSr\rSrg)FlaxAlbertForMaskedLMi;  r&   N)r'   r(   r)   r*   rT  r  r0   r&   r1   r2   r[  r[  ;  s    .Lr1   r[  z
refs/pr/11)revisionc            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg))FlaxAlbertForSequenceClassificationModuleiE  r6   r7   c                 r   [        U R                  U R                  S9U l        U R                  R                  b  U R                  R                  OU R                  R
                  n[        R                  " US9U l        [        R                  " U R                  R                  U R                  S9U l        g )Nr
  r=   r   r2  r6   r7   r  r   rO   r?   rN   rP   rm   
num_labelsr   rR   classifier_dropouts     r2   rS   /FlaxAlbertForSequenceClassificationModule.setupI  s    &dkkL {{22> KK//00 	
 zz'9:((KK""**
r1   rU   rs   r   r   c	                     U R                  UUUUUUUUS9n	U	S   n
U R                  XS9n
U R                  U
5      nU(       d	  U4U	SS  -   $ [        UU	R                  U	R
                  S9$ )Nr<  r   rX   ru   rX  )r  rP   r   r   r$   r%   )rR   rZ   r   r[   r\   rU   rs   r   r   r   r   r   s               r2   r_   2FlaxAlbertForSequenceClassificationModule.__call__V  s     ++'/!5#  	
  
]P/9wqr{**+!//))
 	
r1   r  r   rP   Nr   r   r&   r1   r2   r^  r^  E  s_    {{E399"
& #"'%* "
 "
  "
 #"
 "
 "
r1   r^  z
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       \ rS rSr\rSrg)#FlaxAlbertForSequenceClassificationi{  r&   N)r'   r(   r)   r*   r^  r  r0   r&   r1   r2   ri  ri  {  s	     =Lr1   ri  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)!FlaxAlbertForMultipleChoiceModulei  r6   r7   c                     [        U R                  U R                  S9U l        [        R
                  " U R                  R                  S9U l        [        R                  " SU R                  S9U l	        g )Nr
  r=   r   r   )
r2  r6   r7   r  r?   rN   rO   rP   rm   r   rQ   s    r2   rS   'FlaxAlbertForMultipleChoiceModule.setup  sH    &dkkLzzt{{'F'FG((1DJJ7r1   rU   rs   r   r   c	                 8   UR                   S   n	Ub  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nUb  UR                  SUR                   S   5      OS nU R                  UUUUUUUUS9n
U
S   nU R                  XS9nU R	                  U5      nUR                  SU	5      nU(       d	  U4U
SS  -   $ [        UU
R                  U
R                  S9$ )Nr   r   r<  rX   ru   rX  )r   r   r  rP   r   r   r$   r%   )rR   rZ   r   r[   r\   rU   rs   r   r   num_choicesr   r   r   reshaped_logitss                 r2   r_   *FlaxAlbertForMultipleChoiceModule.__call__  sC     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ++'/!5#  	
  
]P/ ..[9#%33,"!//))
 	
r1   rg  Nr   r   r&   r1   r2   rk  rk    s_    {{E399"8 #"'%* *
 *
  *
 #*
 *
 *
r1   rk  z
    Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       \ rS rSr\rSrg)FlaxAlbertForMultipleChoicei  r&   N)r'   r(   r)   r*   rk  r  r0   r&   r1   r2   rs  rs    s	     5Lr1   rs  z(batch_size, num_choices, sequence_lengthc            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)&FlaxAlbertForTokenClassificationModulei  r6   r7   c                 t   [        U R                  U R                  SS9U l        U R                  R                  b  U R                  R                  OU R                  R
                  n[        R                  " US9U l        [        R                  " U R                  R                  U R                  S9U l        g )NFr6   r7   r3  r=   r   r`  rb  s     r2   rS   ,FlaxAlbertForTokenClassificationModule.setup  s    &dkk_de {{22> KK//00 	
 zz'9:((4;;#9#9Lr1   rU   rs   r   r   c	                     U R                  UUUUUUUUS9n	U	S   n
U R                  XS9n
U R                  U
5      nU(       d	  U4U	SS  -   $ [        UU	R                  U	R
                  S9$ )Nr<  r   rX   r   rX  )r  rP   r   r   r$   r%   )rR   rZ   r   r[   r\   rU   rs   r   r   r   r$   r   s               r2   r_   /FlaxAlbertForTokenClassificationModule.__call__  s     ++'/!5#  	
  
]P/9wqr{**(!//))
 	
r1   rg  Nr   r   r&   r1   r2   ru  ru    s`    {{E399"M  #"'%* "
 "
  "
 #"
 "
 "
r1   ru  z
    Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       \ rS rSr\rSrg) FlaxAlbertForTokenClassificationi  r&   N)r'   r(   r)   r*   ru  r  r0   r&   r1   r2   r|  r|    s	     :Lr1   r|  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)$FlaxAlbertForQuestionAnsweringModulei   r6   r7   c                     [        U R                  U R                  SS9U l        [        R
                  " U R                  R                  U R                  S9U l        g )NFrw  r   )r2  r6   r7   r  r?   rm   ra  
qa_outputsrQ   s    r2   rS   *FlaxAlbertForQuestionAnsweringModule.setup$  s;    &dkk_de((4;;#9#9Lr1   rU   rs   r   r   c	                 T   U R                  UUUUUUUUS9n	U	S   n
U R                  U
5      n[        R                  " XR                  R
                  SS9u  pUR                  S5      nUR                  S5      nU(       d	  X4U	SS  -   $ [        UUU	R                  U	R                  S9$ )Nr<  r   r   rx   r   )start_logits
end_logitsr$   r%   )
r  r  r,   r  r6   ra  squeezer   r$   r%   )rR   rZ   r   r[   r\   rU   rs   r   r   r   r$   r   r  r  s                 r2   r_   -FlaxAlbertForQuestionAnsweringModule.__call__(  s     ++'/!5#  	
  
/#&99V[[5K5KRT#U #++B/''+
 -;;/%!!//))	
 	
r1   )r  r  Nr   r   r&   r1   r2   r~  r~     s`    {{E399"M #"'%* &
 &
  &
 #&
 &
 &
r1   r~  z
    Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       \ rS rSr\rSrg)FlaxAlbertForQuestionAnsweringiQ  r&   N)r'   r(   r)   r*   r~  r  r0   r&   r1   r2   r  r  Q  s	     8Lr1   r  )r  rA  rQ  r[  ri  rs  r|  r  )Qtypingr   r   flax
flax.linenlinenr?   rC   	jax.numpynumpyr,   r   flax.core.frozen_dictr   r   r   flax.linen.attentionr   flax.traverse_utilr	   r
   r   modeling_flax_outputsr   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r   r   r   configuration_albertr   
get_loggerr'   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCstruct	dataclassr    ALBERT_START_DOCSTRINGr-  r+  r4   re   r   r   r   r   r   r   r   r  r2  rA  rC  rQ  %FLAX_ALBERT_FOR_PRETRAINING_DOCSTRINGr.  rT  r[  r^  ri  rk  rs  ru  r|  r~  r  __all__r&   r1   r2   <module>r     s    &   
   > > > ;     g f . 
		H	%-   4[ 4 4:! F B%299 %PRbii Rj)bii )X(		 (V 4,
BII ,
^
		 
>BII 4		 \
 3 \
~C
ryy C
L f$/ $	$ _.ACacr s3
RYY 3
l  28 22) %& ""#@ADii !*HWf
/
")) /
d PRhi/5 / j/ .0BO^j
3
		 3
l  =*C == ' 	3
		 3
l  5"; 55 !8!?!?@j!k !	0
RYY 0
f  :'@ :: $	.
299 .
b  8%> 88 "$		r1   