
    cCi                        S SK r S SKJrJr  S SKJr  S SKrS SKJ	r
  S SK	rS SKJrJrJr  S SKJrJr  S SKJr  SSKJrJrJrJrJrJr  SSKJrJrJrJr  SS	K J!r!J"r"J#r#  S
SK$J%r%  \#RL                  " \'5      r(Sr)Sr*Sr+Sr,S r-S r. " S S\R^                  5      r0 " S S\R^                  5      r1 " S S\R^                  5      r2 " S S\R^                  5      r3 " S S\R^                  5      r4 " S S\R^                  5      r5 " S S\R^                  5      r6 " S  S!\5      r7 " S" S#\R^                  5      r8\!" S$\+5       " S% S&\75      5       r9\" \9\)S\*5         " S' S(\R^                  5      r:\!" S)\+5       " S* S+\75      5       r;\" \;\)\\*5         " S, S-\R^                  5      r<\!" S.\+5       " S/ S0\75      5       r=\" \=\)\\*5         " S1 S2\R^                  5      r>\!" S3\+5       " S4 S5\75      5       r?\" \?\,R                  S65      5        \" \?\)\\*5         " S7 S8\R^                  5      rA\!" S9\+5       " S: S;\75      5       rB\" \B\)\\*5         " S< S=\R^                  5      rC\!" S>\+5       " S? S@\75      5       rD\" \D\)\\*5        / SAQrEg)B    N)CallableOptional)
FrozenDictfreezeunfreeze)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DistilBertConfigzdistilbert-base-uncasedr   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                 x    S[         R                  " SSUS-  -  [         R                  " U5      -  5      -  nX-  $ )Nr   i'     )nppowerfloat32)posid_modelangle_ratess       q/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py
get_anglesr%   `   s6    bhhuqAF|rzz'7J&JKKK    c                    [        [        R                  " U 5      S S 2[        R                  4   [        R                  " U5      [        R                  S S 24   U5      n[        R                  " US S 2SS S24   5      US S 2SS S24'   [        R
                  " US S 2SS S24   5      US S 2SS S24'   U[        R                  S4   n[        R                  " U5      $ )Nr   r   r   .)r%   r   arangenewaxissincosjnparray)positionr"   
angle_radspos_encodings       r$   positional_encodingr1   e   s    BIIh/2::>		'@RSUS]S]_`S`@acjkJ &&Aqt!tG!45Jq!$Q$w &&Aqt!tG!45Jq!$Q$wbjj#o.L99\""r&   c                   r    \ rS rSr% Sr\\S'   \R                  r	\R                  \S'   S r
S
S\4S jjrSrg	)FlaxEmbeddingst   zGConstruct the embeddings from word, position and token_type embeddings.configdtypec                 @   [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9S9U l	        U R                  R                  (       d}  [         R                  " U R                  R                  U R                  R                  [
        R                   R                  R                  U R                  R                  S9S9U l        O9[        U R                  R                  U R                  R                  5      U l        [         R                  " SU R                   S9U l        [         R"                  " U R                  R$                  S9U l        g )Nstddev)embedding_init-q=epsilonr6   rate)nnEmbedr5   
vocab_sizedimjaxinitializersnormalinitializer_rangeword_embeddingssinusoidal_pos_embdsmax_position_embeddingsposition_embeddingsr1   r0   	LayerNormr6   Dropoutdropoutselfs    r$   setupFlaxEmbeddings.setupz   s   !xxKK""KKOO66..55T[[=Z=Z5[ 

 {{//')xx33"vv2299A^A^9_(D$ !4DKK4W4WY]YdYdYhYh iDe4::Fzzt{{':':;r&   deterministicc                    UR                   u  p4U R                  UR                  S5      5      nU R                  R                  (       d\  [
        R                  " U5      R                  S5      n[
        R                  " XcU4S9nU R                  UR                  S5      5      nO3U R                  S S 2S U2S S 24   nUR                  UR                  5      nXW-   nU R                  U5      nU R                  XS9nU$ )Ni4)shaperS   )rV   rH   astyper5   rI   r,   r(   broadcast_torK   r0   r6   rL   rN   )	rP   	input_idsrS   
batch_size
seq_lengthinputs_embedsposition_idsposition_embedshidden_statess	            r$   __call__FlaxEmbeddings.__call__   s    !*
,,Y-=-=d-CD{{//::j188>L++LZ@XYL"66|7J7J47PQO"//;J;0ABO-44]5H5HIO &7 }5]Pr&   )rL   rN   r0   rK   rH   NT)__name__
__module____qualname____firstlineno____doc__r   __annotations__r,   r   r6   rQ   boolra   __static_attributes__ r&   r$   r3   r3   t   s3    Q{{E399"<"  r&   r3   c                   v    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	  S
S\
S\
4S jjrSrg	)FlaxMultiHeadSelfAttention   r5   r6   c                    U R                   R                  U l        U R                   R                  U l        [        R                  " U R                   R
                  S9U l        U R                  U R                  -  S:X  d%  [        SU R                   SU R                   35      e[        R                  " U R                  U R                  [        R                  R                  R                  U R                   R                  S9S9U l        [        R                  " U R                  U R                  [        R                  R                  R                  U R                   R                  S9S9U l        [        R                  " U R                  U R                  [        R                  R                  R                  U R                   R                  S9S9U l        [        R                  " U R                  U R                  [        R                  R                  R                  U R                   R                  S9S9U l        g )Nr>   r   Hidden size " not dividable by number of heads r8   r6   kernel_init)r5   n_headsrC   r@   rM   attention_dropoutrN   
ValueErrorDenser6   rD   rE   rF   rG   q_link_linv_linout_linrO   s    r$   rQ    FlaxMultiHeadSelfAttention.setup   s   {{**;;??zzt{{'D'DE4<<'1,|DHH:5WX\XdXdWefggXXHH**++22$++:W:W2X


 XXHH**++22$++:W:W2X


 XXHH**++22$++:W:W2X


 xxHH**++22$++:W:W2X
r&   rS   output_attentionsc           	        ^ ^^ UR                   u  mpxUR                   S   n	T R                  T R                  -  mTSSU	4n
UUU 4S jnUUU 4S jnU" T R                  U5      5      nU" T R	                  U5      5      nU" T R                  U5      5      nU[        R                  " T5      -  n[        R                  " XR                  SSSS5      5      n[        R                  " XJ5      nUR                  UR                  5      nUSSU-
  -  -
  n[        R                  " US	S
9nT R!                  UUS9n[        R                  " UU5      nU" U5      nT R#                  U5      nU(       a  UU4$ U4$ )Nr   c                 d   > U R                  TSTR                  T5      R                  SSSS5      $ )zseparate headsr   r   r   r   )reshaperu   	transposexbsdim_per_headrP   s    r$   rV   2FlaxMultiHeadSelfAttention.__call__.<locals>.shape   s/    99RT\\<@JJ1aQRTUVVr&   c                 h   > U R                  SSSS5      R                  TSTR                  T-  5      $ )zgroup headsr   r   r   r   r   )r   r   ru   r   s    r$   unshape4FlaxMultiHeadSelfAttention.__call__.<locals>.unshape   s0    ;;q!Q*222r4<<,;VWWr&   r   r   r   gꌠ9Y>)Fg      ?r   axisrW   )rV   rC   ru   ry   rz   r{   mathsqrtr,   matmulr   r   rX   r6   r@   softmaxrN   r|   )rP   querykeyvaluemaskrS   r~   q_lenrC   k_len
mask_reshprV   r   qkvscoresweightscontextr   r   s   `                  @@r$   ra   #FlaxMultiHeadSelfAttention.__call__   sJ    E		! xx4<</!Q&
	W	X $**U#$$**S/"$**U#$		,''A{{1aA67{{4,{{6<<($#*--**V"-,,wm,D**Wa('",,w'W%%:r&   )rC   rN   rz   ru   r|   ry   r{   N)TFrd   re   rf   rg   r   ri   r,   r   r6   rQ   rj   ra   rk   rl   r&   r$   rn   rn      sF    {{E399"
F #"'/ /  / /r&   rn   c                   n    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S	S\
4S jjrSrg)
FlaxFFN   r5   r6   c                    [         R                  " U R                  R                  S9U l        U R                  R                  U l        SU l        [         R                  " U R                  R                  U R                  [        R                   R                  R                  U R                  R                  S9S9U l        [         R                  " U R                  R                  U R                  [        R                   R                  R                  U R                  R                  S9S9U l        [         U R                  R"                     U l        g )Nr>   r   r8   rs   )r@   rM   r5   rN   chunk_size_feed_forwardseq_len_dimrx   
hidden_dimr6   rD   rE   rF   rG   lin1rC   lin2r   
activationrO   s    r$   rQ   FlaxFFN.setup   s    zzt{{':':;'+{{'J'J$HHKK""**++22$++:W:W2X
	
 HHKKOO**++22$++:W:W2X
	 !!7!78r&   rS   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  XS9nU$ )NrW   )r   r   r   rN   )rP   r`   rS   s      r$   ra   FlaxFFN.__call__	  sB    		-06		-0]Pr&   )r   r   rN   r   r   r   Nrc   r   rl   r&   r$   r   r      s0    {{E399"9"T  r&   r   c                   v    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	  S
S\
S\
4S jjrSrg	)FlaxTransformerBlocki  r5   r6   c                    U R                   R                  U R                   R                  -  S:X  d5   SU R                   R                   SU R                   R                   35       e[        U R                   U R                  S9U l        [        R                  " SU R                  S9U l        [        U R                   U R                  S9U l
        [        R                  " SU R                  S9U l        g )Nr   rq   rr   r6   r;   r<   )r5   rC   ru   rn   r6   	attentionr@   rL   sa_layer_normr   ffnoutput_layer_normrO   s    r$   rQ   FlaxTransformerBlock.setup  s    {{!4!449 	
4;;??++MdkkNaNaMbc	
9 4DKKtzzR\\%tzzJ4;;djj9!#e4::!Nr&   r~   rS   c           	         U R                  UUUUUUS9nU(       a  Uu  pVO[        U5      [        L d   eUS   nU R                  XQ-   5      nU R	                  XTS9nU R                  Xu-   5      nU4nU(       a  W4U-   nU$ )N)r   r   r   r   r~   rS   r   rW   )r   typetupler   r   r   )	rP   r`   	attn_maskr~   rS   	sa_output
sa_weights
ffn_outputoutputs	            r$   ra   FlaxTransformerBlock.__call__   s     NN/' # 
	 $-!Iz	?e+++!!I&&y'@A	 XXiXE
++J,BC
 ]V+Fr&   )r   r   r   r   N)FTr   rl   r&   r$   r   r     sF    {{E399"	O #("  	
  r&   r   c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxTransformeri@  r5   r6   c           	          [        U R                  R                  5       Vs/ s H+  n[        U R                  [	        U5      U R
                  S9PM-     snU l        g s  snf )N)namer6   )ranger5   n_layersr   strr6   layers)rP   r!   s     r$   rQ   FlaxTransformer.setupD  sL    V[\`\g\g\p\pVq
VqQR 3q6LVq
 
s   2Ar~   output_hidden_statesrS   return_dictc                 T   U(       a  SOS nU(       a  SOS nU R                    HR  n	U(       a  Xq4-   nU	" UUUUS9n
U
S   nU(       a  [        U
5      S:X  d   eU
S   nX4-   nMA  [        U
5      S:X  a  MR   e   U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )	Nrl   )r`   r   r~   rS   r   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fNrl   ).0r   s     r$   	<genexpr>+FlaxTransformer.__call__.<locals>.<genexpr>m  s     h$Vq$Vs   	)last_hidden_stater`   
attentions)r   lenr   r   )rP   r`   attention_maskr~   r   rS   r   all_hidden_statesall_attentionslayer_modulelayer_outputsr   s               r$   ra   FlaxTransformer.__call__I  s     #7BD0d KKL#$58H$H!(+("3+	M *"-M =)Q...*1-
!/-!?=)Q...# ((   14D Dh]DU$Vhhh"+Yg
 	
r&   )r   NFFTFr   rl   r&   r$   r   r   @  s_    {{E399"
 #(%*"!'
  	'

 #'
 '
 '
 '
r&   r   c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxTransformerEncoderis  r5   r6   c                 J    [        U R                  U R                  S9U l        g Nr   )r   r5   r6   layerrO   s    r$   rQ   FlaxTransformerEncoder.setupw  s    $T[[

C
r&   r~   r   rS   r   c           	      *    U R                  UUUUUUS9$ )N)r`   r   r~   r   rS   r   r   )rP   r`   r   r~   r   rS   r   s          r$   ra   FlaxTransformerEncoder.__call__z  s,     zz')/!5'#  
 	
r&   r   Nr   r   rl   r&   r$   r   r   s  s`    {{E399"D #(%*"!
  	

 #
 
 
 
r&   r   c                       \ rS rSr% \\S'   \R                  r\R                  \S'   \	R                  R                  R                  r\S\R                   4   \S'   S rS rSrg	)
FlaxDistilBertLMDecoderi  r5   r6   .	bias_initc                 r    U R                  SU R                  U R                  R                  45      U l        g )Nbias)paramr   r5   rB   r   rO   s    r$   rQ   FlaxDistilBertLMDecoder.setup  s'    JJvt~~8N8N7PQ	r&   c                 6   [         R                  " XR                  5      n[         R                  " X R                  5      n[        R                  " XUR
                  S-
  4S4S45      n[         R                  " U R                  U R                  5      nX4-   nU$ )Nr   )r   )rl   rl   )r,   asarrayr6   r
   dot_generalndimr   )rP   inputskernelyr   s        r$   ra    FlaxDistilBertLMDecoder.__call__  so    VZZ0VZZ0OOFv{{Q.@$-G,RS{{499djj1Hr&   )r   N)rd   re   rf   rg   r   ri   r,   r   r6   rD   r@   rE   zerosr   r   r   ndarrayrQ   ra   rk   rl   r&   r$   r   r     sL    {{E399"+.66+>+>+D+DIxRZZ(DRr&   r   c                     ^  \ rS rSr% Sr\rSrSr\	R                  \S'   SS\R                  S4S	\S
\S\S\R                   S\4
U 4S jjjrSS\R(                  R*                  S
\S\S\4S jjr\" \R5                  S5      5              SS\\   S\R(                  R*                  S\S\\   S\\   S\\   4S jj5       rSrU =r$ )FlaxDistilBertPreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.

distilbertNmodule_class)r   r   r   Tr5   input_shapeseedr6   _do_initc           	      L   > U R                   " SXS.UD6n[        TU ]	  XX#XES9  g )Nr5   r6   )r   r   r6   r   rl   )r   super__init__)	rP   r5   r   r   r6   r   kwargsmodule	__class__s	           r$   r   &FlaxDistilBertPreTrainedModel.__init__  s2     ""H&HH[SXlr&   rngparamsreturnc                    [         R                  " USS9n[         R                  " U5      n[        R                  R                  U5      u  pgXgS.nU R                  R                  XUSS9S   n	Ubd  [        [        U	5      5      n	[        [        U5      5      nU R                   H	  n
X   X:'   M     [        5       U l
        [        [        U5      5      $ U	$ )NrU   r   )r  rN   F)r   r  )r,   r   	ones_likerD   randomsplitr  initr   r   _missing_keyssetr   r	   )rP   r  r   r  rZ   r   
params_rngdropout_rngrngsrandom_paramsmissing_keys              r$   init_weights*FlaxDistilBertPreTrainedModel.init_weights  s    IIk6	y1"%**"2"23"7
$=((.V[(\]ef(-)@AM!(6"23F#11&3&@#  2!$D.011  r&   zbatch_size, sequence_lengthr  trainr~   r   r   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Uc  [        R
                  " U5      n0 n
Ub  XZS'   U R                  R                  SU=(       d    U R                  0[        R                  " USS9[        R                  " USS9U(       + UUU	U
S9$ )NrN   r  rU   r   )r  )
r5   r~   r   r   r,   r  r  applyr  r-   )rP   rZ   r   	head_maskr  r  r  r~   r   r   r  s              r$   ra   &FlaxDistilBertPreTrainedModel.__call__  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY! ]]95N ")O{{  v,-IIit,IInD1I  ! 	
 		
r&   )r  r   )NNNNFNNN) rd   re   rf   rg   rh   r   config_classbase_model_prefixr   r@   Moduleri   r,   r   r   intr6   rj   r   rD   r	  PRNGKeyr   r  r   DISTILBERT_INPUTS_DOCSTRINGformatr   dictra   rk   __classcell__)r  s   @r$   r   r     s>   
 $L$"L"))"
 $;;
m 
m 
m 	
m
 yy
m 
m 
m!

 2 2 ! !PZ !fp !( ++F+M+MNk+lm !%*.,0/3&*#

 #
 ZZ''#
 #
 $D>#
 'tn#
 d^#
 n#
r&   r   c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxDistilBertModulei  r5   r6   c                     [        U R                  U R                  S9U l        [	        U R                  U R                  S9U l        g r   )r3   r5   r6   
embeddingsr   transformerrO   s    r$   rQ   FlaxDistilBertModule.setup  s/    (DJJG1$++TZZPr&   rS   r~   r   r   c           	          Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  XS9nU R                  UUUUUUS9$ )NrW   )r`   r   rS   r~   r   r   )r5   r~   r   r   r&  r'  )rP   rZ   r   rS   r~   r   r   input_embedss           r$   ra   FlaxDistilBertModule.__call__  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBYyN&)'/!5#   
 	
r&   )r&  r'  NTFFTr   rl   r&   r$   r$  r$    s`    {{E399"Q #"'%* 
 	

  
 #
 
 
r&   r$  zdThe bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.c                       \ rS rSr\rSrg)FlaxDistilBertModeli  rl   N)rd   re   rf   rg   r$  r   rk   rl   r&   r$   r.  r.    s	    
 (Lr&   r.  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxDistilBertForMaskedLMModulei  r5   r6   c                    [        U R                  U R                  S9U l        [        R
                  " U R                  R                  U R                  [        R                  R                  R                  U R                  R                  S9S9U l        [        R                  " SU R                  S9U l        U R                  R                  (       a$  [        U R                  U R                  S9U l        g [        R
                  " U R                  R"                  U R                  [        R                  R                  R                  U R                  R                  S9S9U l        g )Nr   r8   rs   r;   r<   )r$  r5   r6   r   r@   rx   rC   rD   rE   rF   rG   vocab_transformrL   vocab_layer_normtie_word_embeddingsr   vocab_projectorrB   rO   s    r$   rQ   %FlaxDistilBertForMaskedLMModule.setup   s    .t{{$**M!xxKKOO**++22$++:W:W2X 

 !#U$** M;;**#:jj$D 
 $&88&&jjFF//66dkk>[>[6\$D r&   rS   r~   r   r   c           	      *   Ub  UOU R                   R                  nU R                  UUUUUUS9nUS   nU R                  U5      n	[        U R                   R
                     " U	5      n	U R                  U	5      n	U R                   R                  (       a>  U R                  R                  S   S   S   S   n
U R                  XR                  5      n	OU R                  U	5      n	U(       d  U	4USS  -   nU$ [        U	UR                  UR                  S9$ )	N)rZ   r   r~   r   rS   r   r   r  r&  rH   	embeddingr   logitsr`   r   )r5   use_return_dictr   r2  r   r   r3  r4  	variablesr5  Tr   r`   r   )rP   rZ   r   rS   r~   r   r   dlbrt_outputr`   prediction_logitsshared_embeddingr   s               r$   ra   (FlaxDistilBertForMaskedLMModule.__call__4  s#    &1%<k$++B]B])/!5'# ' 
 %Q 00?"4;;#9#9:;LM 112CD;;**#88B<PQbcdop $ 4 45FHZHZ [ $ 4 45F G')L,<<FM!$&44#..
 	
r&   )r   r3  r5  r2  Nr,  r   rl   r&   r$   r0  r0    s_    {{E399"0 #"'%* &
 	&

  &
 #&
 &
 &
r&   r0  z8DistilBert Model with a `language modeling` head on top.c                       \ rS rSr\rSrg)FlaxDistilBertForMaskedLMi]  rl   N)rd   re   rf   rg   r0  r   rk   rl   r&   r$   rC  rC  ]  s    2Lr&   rC  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)-FlaxDistilBertForSequenceClassificationModuleie  r5   r6   c                    [        U R                  U R                  S9U l        [        R
                  " U R                  R                  U R                  [        R                  R                  R                  U R                  R                  S9S9U l        [        R                  " U R                  R                  S9U l        [        R
                  " U R                  R                  U R                  S9U l        g )Nr   r8   rs   r>   r   )r$  r5   r6   r   r@   rx   rC   rD   rE   rF   rG   pre_classifierrM   seq_classif_dropoutrN   
num_labels
classifierrO   s    r$   rQ   3FlaxDistilBertForSequenceClassificationModule.setupi  s    .dkkT hhKKOO**++22$++:W:W2X

 zzt{{'F'FG((KK""**
r&   rS   r~   r   r   c           	      Z   Ub  UOU R                   R                  nU R                  UUUUUUS9nUS   nUS S 2S4   n	U R                  U	5      n	[        S   " U	5      n	U R                  XS9n	U R                  U	5      n
U(       d	  U
4USS  -   $ [        U
UR                  UR                  S9$ )NrS   r~   r   r   r   relurW   r   r9  )
r5   r;  r   rG  r   rN   rJ  r   r`   r   )rP   rZ   r   rS   r~   r   r   distilbert_outputhidden_statepooled_outputr:  s              r$   ra   6FlaxDistilBertForSequenceClassificationModule.__call__v  s     &1%<k$++B]B] OO'/!5# , 
 )+$QT*++M:v}5]P/90444++99(33
 	
r&   rJ  r   rN   rG  Nr,  r   rl   r&   r$   rE  rE  e  s_    {{E399"
" #"'%* !
 	!

  !
 #!
 !
 !
r&   rE  z
    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       \ rS rSr\rSrg)'FlaxDistilBertForSequenceClassificationi  rl   N)rd   re   rf   rg   rE  r   rk   rl   r&   r$   rU  rU    s
     ALr&   rU  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)%FlaxDistilBertForMultipleChoiceModulei  r5   r6   c                    [        U R                  U R                  S9U l        [        R
                  " U R                  R                  U R                  [        R                  R                  R                  U R                  R                  S9S9U l        [        R                  " U R                  R                  S9U l        [        R
                  " SU R                  S9U l        g )Nr   r8   rs   r>   r   r   )r$  r5   r6   r   r@   rx   rC   rD   rE   rF   rG   rG  rM   rH  rN   rJ  rO   s    r$   rQ   +FlaxDistilBertForMultipleChoiceModule.setup  s    .dkkT hhKKOO**++22$++:W:W2X

 zzt{{'F'FG((**
r&   rS   r~   r   r   c           	      ,   Ub  UOU R                   R                  nUR                  S   nUb  UR                  SUR                  S   5      OS nUb  UR                  SUR                  S   5      OS nU R	                  UUUUUUS9nUS   n	U	S S 2S4   n
U R                  U
5      n
[        S   " U
5      n
U R                  XS9n
U R                  U
5      nUR                  SU5      nU(       d	  U4USS  -   $ [        UUR                  UR                  S9$ )	Nr   r   rM  r   rN  rW   r   r9  )r5   r;  rV   r   r   rG  r   rN   rJ  r   r`   r   )rP   rZ   r   rS   r~   r   r   num_choicesoutputsrP  rQ  r:  reshaped_logitss                r$   ra   .FlaxDistilBertForMultipleChoiceModule.__call__  s5    &1%<k$++B]B]ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMqu //'/!5# " 
 qz$QT*++M:v}5]P/ ..[9#%33,"!//))
 	
r&   rS  Nr,  r   rl   r&   r$   rW  rW    s_    {{E399"
" #"'%* (
 	(

  (
 #(
 (
 (
r&   rW  z
    DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    c                       \ rS rSr\rSrg)FlaxDistilBertForMultipleChoicei  rl   N)rd   re   rf   rg   rW  r   rk   rl   r&   r$   r`  r`    s	     9Lr&   r`  z(batch_size, num_choices, sequence_lengthc            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)*FlaxDistilBertForTokenClassificationModulei  r5   r6   c                    [        U R                  U R                  S9U l        [        R
                  " U R                  R                  S9U l        [        R                  " U R                  R                  U R                  S9U l	        g )Nr   r>   r   )
r$  r5   r6   r   r@   rM   rN   rx   rI  rJ  rO   s    r$   rQ   0FlaxDistilBertForTokenClassificationModule.setup  sR    .dkkTzzt{{':':;((4;;#9#9Lr&   rS   r~   r   r   c           	         Ub  UOU R                   R                  nU R                  UUUUUUS9nUS   nU R                  XS9nU R	                  U5      n	U(       d	  U	4USS  -   $ [        U	UR                  UR                  S9$ )NrM  r   rW   r   r9  )r5   r;  r   rN   rJ  r   r`   r   )
rP   rZ   r   rS   r~   r   r   r\  r`   r:  s
             r$   ra   3FlaxDistilBertForTokenClassificationModule.__call__  s     &1%<k$++B]B]//'/!5# " 
  
]P/9wqr{**(!//))
 	
r&   )rJ  r   rN   Nr,  r   rl   r&   r$   rb  rb    s`    {{E399"M #"'%* 
 	

  
 #
 
 
r&   rb  z
    DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    c                       \ rS rSr\rSrg)$FlaxDistilBertForTokenClassificationi*  rl   N)rd   re   rf   rg   rb  r   rk   rl   r&   r$   rh  rh  *  s	     >Lr&   rh  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)(FlaxDistilBertForQuestionAnsweringModulei=  r5   r6   c                 L   [        U R                  U R                  S9U l        [        R
                  " U R                  R                  U R                  S9U l        U R                  R                  S:X  d   e[        R                  " U R                  R                  S9U l
        g )Nr   r   r   r>   )r$  r5   r6   r   r@   rx   rI  
qa_outputsrM   
qa_dropoutrN   rO   s    r$   rQ   .FlaxDistilBertForQuestionAnsweringModule.setupA  sj    .dkkT((4;;#9#9L{{%%***zzt{{'='=>r&   rS   r~   r   r   c           	         Ub  UOU R                   R                  nU R                  UUUUUUS9nUS   nU R                  XS9nU R	                  U5      n	[
        R                  " XR                   R                  SS9u  pU
R                  S5      n
UR                  S5      nU(       d	  X4USS  -   $ [        U
UUR                  UR                  S9$ )NrM  r   rW   r   r   r   )start_logits
end_logitsr`   r   )r5   r;  r   rN   rl  r,   r
  rI  squeezer   r`   r   )rP   rZ   r   rS   r~   r   r   rO  r`   r:  rp  rq  s               r$   ra   1FlaxDistilBertForQuestionAnsweringModule.__call__G  s     &1%<k$++B]B] !OO'/!5# , 
 *!,]P/#&99V[[5K5KRT#U #++B/''+
 -0A!"0EEE/%!+99(33	
 	
r&   )r   rN   rl  Nr,  r   rl   r&   r$   rj  rj  =  s_    {{E399"? #"'%* %
 	%

  %
 #%
 %
 %
r&   rj  z
    DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       \ rS rSr\rSrg)"FlaxDistilBertForQuestionAnsweringio  rl   N)rd   re   rf   rg   rj  r   rk   rl   r&   r$   ru  ru  o  s	     <Lr&   ru  )rC  r`  ru  rU  rh  r.  r   )Fr   typingr   r   
flax.linenlinenr@   rD   	jax.numpynumpyr,   r   flax.core.frozen_dictr   r   r   flax.traverse_utilr   r	   r
   modeling_flax_outputsr   r   r   r   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_distilbertr   
get_loggerrd   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCFLAX_DISTILBERT_START_DOCSTRINGr  r%   r1   r  r3   rn   r   r   r   r   r   r   r$  r.  r0  rC  rE  rU  rW  r`  r   rb  rh  rj  ru  __all__rl   r&   r$   <module>r     s     %  
   > > ;   w v Y Y 6 
		H	%/ $# . 6
#*RYY *ZP Pfbii :,299 ,^0
bii 0
f
RYY 
4bii "N
$7 N
b
299 
D j#(7 (	( 02Et_ ]>
bii >
B TVuv3 = 3 w3 68KM_ap q2
BII 2
j  $A.K AA + 	9
BII 9
x  $9&C 99 #%@%G%GHr%s #!	(
 (
V  $>+H >> (	/
ryy /
d  $<)F << &$	r&   