
    bCig                     *   S SK JrJrJr  S SKrS SKJr  S SKrS SK	J
r  S SKJrJrJr  S SKJrJr  S SKJr  S SKJrJr  S SKJr  SS	KJrJr  SS
KJrJrJrJr  SSK J!r!J"r"J#r#  SSK$J%r%J&r&J'r'  \#RP                  " \)5      r*Sr+Sr,Sr-Sr.\R^                  R`                   " S S\!5      5       r1\R^                  R`                   " S S\!5      5       r2 " S S\Rf                  5      r4 " S S\Rf                  5      r5 " S S\Rf                  5      r6 " S S\Rf                  5      r7 " S S\Rf                  5      r8 " S  S!\Rf                  5      r9 " S" S#\Rf                  5      r: " S$ S%\Rf                  5      r; " S& S'\Rf                  5      r< " S( S)\5      r= " S* S+\5      r> " S, S-\5      r? " S. S/\Rf                  5      r@ " S0 S1\=5      rAS2rB\" \A\,\B-   5        \" \A\\&S39   " S4 S5\Rf                  5      rC " S6 S7\=5      rDS8rE\" \D\,\E-   5        \" \D\1\&S39   " S9 S:\Rf                  5      rF " S; S<\>5      rGS=rH\" \G\-\H-   5        \" \G\\'S39   " S> S?\Rf                  5      rI\"" \+5       " S@ SA\?5      5       rJSBrK\" \J\.\K-   5        \" \J\2\%S39  / SCQrLg)D    )AnyOptionalUnionN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxBaseModelOutputWithPooling)ACT2FNFlaxPreTrainedModel append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstringslogging   )
CLIPConfigCLIPTextConfigCLIPVisionConfiga  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
a~  
    Args:
        input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
aA  
    Args:
        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                       \ rS rSr% SrSr\R                  \S'   Sr	\R                  \S'   Sr
\\\R                  S4      \S'   Sr\\\R                  S4      \S'   S	rg)
FlaxCLIPTextModelOutput   a  
Base class for text model's outputs that also contains a pooling of the last hidden states.

Args:
    text_embeds (`jnp.ndarray` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of
        [`FlaxCLIPTextModel`].
    last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Ntext_embedslast_hidden_state.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r    jnpndarray__annotations__r!   r"   r   tupler#   __static_attributes__r$       e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/clip/modeling_flax_clip.pyr   r      s`    ,  $K#%)s{{)7;M8E#++s"234;48Js{{C/018r/   r   c                       \ rS rSr% SrSr\R                  \S'   Sr	\R                  \S'   Sr
\R                  \S'   Sr\R                  \S'   Sr\\S'   Sr\\S	'   S
\\   4S jrSrg)FlaxCLIPOutput   a   
Args:
    logits_per_image:(`jnp.ndarray` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text:(`jnp.ndarray` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds(`jnp.ndarray` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of
        [`FlaxCLIPTextModel`].
    image_embeds(`jnp.ndarray` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`FlaxCLIPVisionModel`].
    text_model_output(`FlaxBaseModelOutputWithPooling`):
        The output of the [`FlaxCLIPTextModel`].
    vision_model_output(`FlaxBaseModelOutputWithPooling`):
        The output of the [`FlaxCLIPVisionModel`].
Nlogits_per_imagelogits_per_textr    image_embedstext_model_outputvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r7   r8   N)getattrto_tuple).0kselfs     r0   	<genexpr>*FlaxCLIPOutput.to_tuple.<locals>.<genexpr>   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)r-   keysr@   s   `r0   r=   FlaxCLIPOutput.to_tuple   s#     
YY[
 
 	
r/   r$   )r%   r&   r'   r(   r)   r4   r*   r+   r,   r5   r    r6   r7   r   r8   r-   r   r=   r.   r$   r/   r0   r2   r2      sj    ( %)ckk(#'OS[['#K# $L#++$8<5<:>7>
%* 
r/   r2   c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxCLIPVisionEmbeddings   configdtypec           
         U R                   R                  nU R                   R                  nU R                   R                  nU R	                  S[
        R                  R                  R                  SS9U45      U l	        [        R                  " UX34X34SSU R                  [
        R                  R                  R                  5       S9U l        X#-  S-  U l        U R                  S-   n[        R                  " XA[
        R                  R                  R                  5       S	9U l        [         R"                  " [         R$                  " S
USS9S
S9U l        g )Nclass_embedding{Gz?)stddevVALIDF)kernel_sizestridespaddinguse_biasrJ   kernel_init   r   embedding_initr   i4rJ   axis)rI   hidden_size
image_size
patch_sizeparamjaxnninitializersnormalrL   ConvrJ   patch_embeddingnum_patchesEmbedposition_embeddingr*   expand_dimsarangeposition_ids)r@   	embed_dimr]   r^   num_positionss        r0   setupFlaxCLIPVisionEmbeddings.setup   s   KK++	[[++
[[++
#zz*;SVV=P=P=W=W_c=W=dgpfrs!ww#0,**++224 
 '4:((1,"$((=TWTZTZTgTgTnTnTp"qOOCJJq-t,T[\]r/   c                 R   U R                  U5      nUR                  u  p4pV[        R                  " X#XE-  U45      n[        R                  " U R
                  SS9n[        R                  " XsSS45      n[        R                  " Xr/SS9nXR                  U R                  5      -   nU$ )Nr   r   rZ   r   )
re   shaper*   reshaperi   rL   tileconcatenaterh   rk   )	r@   pixel_valuespatch_embeds
batch_sizeheightwidthchannelsclass_embeds
embeddingss	            r0   __call__!FlaxCLIPVisionEmbeddings.__call__   s    ++L9.:.@.@+
E{{<fnh1WXt';';&Ixx1a.@A__l%AJ
"9"9$:K:K"LL
r/   )rL   rf   re   rh   rk   N)r%   r&   r'   r(   r   r,   r*   float32rJ   rn   r~   r.   r$   r/   r0   rG   rG      s%    {{E399"^,	r/   rG   c                   b    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	S r
Srg)FlaxCLIPTextEmbeddingsi  rI   rJ   c                    U R                   R                  n[        R                  " U R                   R                  U[
        R                  R                  R                  5       S9U l        [        R                  " U R                   R                  U[
        R                  R                  R                  5       S9U l
        [        R                  " [        R                  " SU R                   R                  SS9SS9U l        g )NrV   r   rX   rY   rq   rZ   )rI   r\   ra   rg   
vocab_sizer`   rb   rc   token_embeddingmax_position_embeddingsrh   r*   ri   rj   rk   )r@   rl   s     r0   rn   FlaxCLIPTextEmbeddings.setup  s    KK++	!xx(>(>	Z]Z`Z`ZmZmZtZtZvw"$((KK//366K^K^KeKeKg#
  OOJJq$++==TJQW
r/   c                     U R                  UR                  S5      5      nU R                  UR                  S5      5      nX4-   nU$ )NrX   )r   astyperh   )r@   	input_idsrk   input_embedsposition_embedsr}   s         r0   r~   FlaxCLIPTextEmbeddings.__call__  sF    ++I,<,<T,BC11,2E2Ed2KL!3
r/   )rh   rk   r   N)r%   r&   r'   r(   r   r,   r*   r   rJ   rn   r~   r.   r$   r/   r0   r   r     s$    {{E399"	
r/   r   c                       \ rS rSr% \\\4   \S'   \R                  r
\R                  \S'   S rS rS r   SS\S	\4S
 jjrSrg)FlaxCLIPAttentioni  rI   rJ   c                 (   U R                   R                  U l        U R                   R                  U l        U R                  U R                  -  U l        U R
                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R
                  S-  U l        U R                   R                  U l	        [        R                  " U R                  U R                  [        R                  R                  R                  S5      S9U l        [        R                  " U R                  U R                  [        R                  R                  R                  S5      S9U l        [        R                  " U R                  U R                  [        R                  R                  R                  S5      S9U l        [        R                  " U R                  U R                  [        R                  R                  R                  S5      S9U l        [)        U R                   [*        5      U l        U R,                  (       a:  [/        [0        R2                  " SU R                   R4                  4SS	95      U l        g g )
Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      {Gz?rJ   rT   r   rX   rY   )rI   r\   rl   num_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutdropoutra   DenserJ   r`   rb   rc   k_projv_projq_projout_proj
isinstancer   causalr
   r*   onesr   causal_maskrD   s    r0   rn   FlaxCLIPAttention.setup"  s   0088$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
{{44hht~~TZZSVVM`M`MgMghlMmnhht~~TZZSVVM`M`MgMghlMmnhht~~TZZSVVM`M`MgMghlMmntzzsvvObObOiOijnOop n=;;/!T[[=`=`9aim0noD r/   c                 p    UR                  UR                  S S U R                  U R                  4-   5      $ NrU   )rs   rr   r   r   r@   r"   s     r0   _split_headsFlaxCLIPAttention._split_heads7  s5    $$]%8%8!%<PTP]P]?^%^__r/   c                 Z    UR                  UR                  S S U R                  4-   5      $ r   )rs   rr   rl   r   s     r0   _merge_headsFlaxCLIPAttention._merge_heads:  s,    $$]%8%8!%<?P%PQQr/   Ndeterministicoutput_attentionsc                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nS nU R                  (       a:  UR
                  S   UR
                  S   pU R                  S S 2S S 2X-
  U
2S U
24   nUb#  Ub   [        R                  " USS9n[        X(SS9nOUb  UnOUb  [        R                  " USS9nUb  [        R                  " US:  [        R                  " UR
                  S5      R                  U R                  5      [        R                  " UR
                  [        R                  " U R                  5      R                   5      R                  U R                  5      5      nOS nS nU(       d!  U R"                  S:  a  U R%                  S5      n['        UUUUU R"                  UU R                  S S	9n[        R(                  " S
X5      nU R+                  U5      nU R-                  U5      nU(       a  X4nU$ U4nU$ )Nr   )rZ   rX   rY   r   g        r   )biasdropout_rngdropout_rater   rJ   	precisionz...hqk,...khd->...qhd)r   r   r   r   r   rr   r   r*   ri   r	   r   selectfullr   rJ   finfominr   make_rngr   einsumr   r   )r@   r"   attention_maskr   r   querykeyvaluecausal_attention_maskquery_length
key_lengthattention_biasr   attn_weightsattn_outputoutputss                   r0   r~   FlaxCLIPAttention.__call__=  s    M*kk-(M*!!%($!!%( $;;',{{1~syy|*$($4$4Q:;TWa;acndncn5n$o!%*?*K __^(KN*>X\]N".2N' __^(KN% ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!3--	2K4#'**	
 jj!8,N''4mmK01B;- JUr/   )r   r   r   rl   r   r   r   r   r   r   r   )NTF)r%   r&   r'   r(   r   r   r   r,   r*   r   rJ   rn   r   r   boolr~   r.   r$   r/   r0   r   r     s`    ."2233{{E399"p*`R ""'9 	9
  9 9r/   r   c                   l    \ rS rSr% \\\4   \S'   \R                  r
\R                  \S'   S rS rSrg)FlaxCLIPMLPiy  rI   rJ   c                    [         U R                  R                     U l        [        R
                  " U R                  R                  U R                  [        R                  R                  R                  S5      S9U l        [        R
                  " U R                  R                  U R                  [        R                  R                  R                  S5      S9U l        g )Nr   r   )r   rI   
hidden_actactivation_fnra   r   intermediate_sizerJ   r`   rb   rc   fc1r\   fc2rD   s    r0   rn   FlaxCLIPMLP.setup}  s    #DKK$:$:;88KK))**++2248

 88DKK334::SVSYSYSfSfSmSmnrSstr/   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ N)r   r   r   r   s     r0   r~   FlaxCLIPMLP.__call__  s4    /**=9/r/   )r   r   r   N)r%   r&   r'   r(   r   r   r   r,   r*   r   rJ   rn   r~   r.   r$   r/   r0   r   r   y  s0    ."2233{{E399"ur/   r   c                       \ rS rSr% \\\4   \S'   \R                  r
\R                  \S'   S r  S
S\S\4S jjrSrg	)FlaxCLIPEncoderLayeri  rI   rJ   c                 p   [        U R                  U R                  S9U l        [        R
                  " U R                  R                  U R                  S9U l        [        U R                  U R                  S9U l	        [        R
                  " U R                  R                  U R                  S9U l
        g NrY   )epsilonrJ   )r   rI   rJ   	self_attnra   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rD   s    r0   rn   FlaxCLIPEncoderLayer.setup  sv    *4;;djjI<<0J0JRVR\R\]t{{$**=<<0J0JRVR\R\]r/   r   r   c                     UnU R                  U5      nU R                  UUUUS9nUS   nXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  XvSS  -  nU$ )N)r"   r   r   r   r   r   )r   r   r   r   )r@   r"   r   r   r   residualattn_outputsr   s           r0   r~   FlaxCLIPEncoderLayer.__call__  s     !((7~~')'/	 & 
 %Q 0 ((7/ 0 "AB''Gr/   )r   r   r   r   N)TFr%   r&   r'   r(   r   r   r   r,   r*   r   rJ   rn   r   r~   r.   r$   r/   r0   r   r     sQ    ."2233{{E399"^ #"' 	
   r/   r   c            	           \ rS rSr% \\\4   \S'   \R                  r
\R                  \S'   S r     SS\S\S\S	\4S
 jjrSrg)FlaxCLIPLayerCollectioni  rI   rJ   c           	          [        U R                  R                  5       Vs/ s H+  n[        U R                  [	        U5      U R
                  S9PM-     snU l        g s  snf )N)namerJ   )rangerI   num_hidden_layersr   strrJ   layers)r@   is     r0   rn   FlaxCLIPLayerCollection.setup  sL     4;;889
9 !3q6L9
 
s   2ANr   r   output_hidden_statesreturn_dictc                 
   U(       a  SOS nU(       a  SOS nU R                    H,  n	U(       a  X4-  nU	" XX4S9n
U
S   nU(       d  M$  XzS   4-  nM.     U(       a  X4-  nU4nU(       d  [        S U 5       5      $ [        XUS9$ )Nr$   )r   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r$   )r>   vs     r0   rA   3FlaxCLIPLayerCollection.__call__.<locals>.<genexpr>  s     =GqGs   	)r!   r"   r#   )r   r-   r   )r@   r"   r   r   r   r   r   all_attentionsall_hidden_stateslayerlayer_outputsr   s               r0   r~    FlaxCLIPLayerCollection.__call__  s      1d"6BD[[E#!%55!!]M *!,M  #3"55 !  !11 "=G==="+Yg
 	
r/   r   NTFFTr   r$   r/   r0   r   r     sm    ."2233{{E399"
 ""'%* "
 	"

  "
 #"
 "
 "
r/   r   c            	           \ rS rSr% \\\4   \S'   \R                  r
\R                  \S'   S r     SS\S\S\S	\4S
 jjrSrg)FlaxCLIPEncoderi  rI   rJ   c                 J    [        U R                  U R                  S9U l        g NrY   )r   rI   rJ   r   rD   s    r0   rn   FlaxCLIPEncoder.setup  s    -dkkLr/   Nr   r   r   r   c           	      *    U R                  UUUUUUS9$ )N)r"   r   r   r   r   r   r   )r@   inputs_embedsr   r   r   r   r   s          r0   r~   FlaxCLIPEncoder.__call__  s,     {{')'/!5#  
 	
r/   r   r   r   r$   r/   r0   r   r     sn    ."2233{{E399"M ""'%* 
 	

  
 #
 
 
r/   r   c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxCLIPTextTransformeri   rI   rJ   c                 6   [        U R                  U R                  S9U l        [	        U R                  U R                  S9U l        [        R                  " U R                  R                  U R                  S9U l	        U R                  R                  U l
        g r   )r   rI   rJ   r}   r   encoderra   r   r   final_layer_normeos_token_idrD   s    r0   rn   FlaxCLIPTextTransformer.setup  sf    0DJJO&t{{$**E "T[[5O5OW[WaWa b !KK44r/   r   r   r   r   c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  XS9nU R                  UUUUUUS9n	U	S   n
U R                  U
5      n
U R                  S:X  a6  U
[        R                  " U
R                  S   5      UR                  SS94   nOAU
[        R                  " U
R                  S   5      XR                  :H  R                  SS94   nU(       d	  X4U	SS  -   $ [        U
UU	R                  U	R                  S9$ )	N)r   rk   )r  r   r   r   r   r   r   rU   rZ   r   r!   pooler_outputr"   r#   )rI   r   r   use_return_dictr}   r  r  r	  r*   rj   rr   argmaxr   r"   r#   )r@   r   r   rk   r   r   r   r   r"   encoder_outputsr!   pooled_outputs               r0   r~    FlaxCLIPTextTransformer.__call__  sk    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B])W,,')'/!5# ' 
 ,A. 112CD! .cjj9J9P9PQR9S.TV_VfVflnVfVo.opM .

,22156FWFW9W8_8_eg8_8hhM %58KKK-/')77&11	
 	
r/   )r}   r  r	  r  NTFFTr%   r&   r'   r(   r   r,   r*   r   rJ   rn   r   r~   r.   r$   r/   r0   r  r     s_    {{E399"5 #"'%* 3

 3
  3
 #3
 3
 3
r/   r  c                   |    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	     S
S\
S\
4S jjrS	rg)FlaxCLIPVisionTransformeriB  rI   rJ   c                 p   [        U R                  U R                  S9U l        [        R
                  " U R                  R                  U R                  S9U l        [        U R                  U R                  S9U l	        [        R
                  " U R                  R                  U R                  S9U l
        g r   )rG   rI   rJ   r}   ra   r   r   pre_layrnormr   r  post_layernormrD   s    r0   rn   FlaxCLIPVisionTransformer.setupF  sv    24;;djjQLL1K1KSWS]S]^&t{{$**E ll4;;3M3MUYU_U_`r/   Nr   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nU R                  U5      nU R                  UUUUUS9nUS   nUS S 2SS S 24   n	U R                  U	5      n	U(       d	  X4USS  -   $ [        UU	UR                  UR                  S9$ )N)r  r   r   r   r   r   r   r  )rI   r   r   r  r}   r  r  r  r   r"   r#   )
r@   rv   r   r   r   r   r"   r  r!   r  s
             r0   r~   "FlaxCLIPVisionTransformer.__call__L  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]5))-8,,''/!5# ' 
 ,A.)!Q'2++M:%58KKK-/')77&11	
 	
r/   )r}   r  r  r  )NTNNTr%   r&   r'   r(   r   r,   r*   r   rJ   rn   r   r~   r.   r$   r/   r0   r  r  B  sO    {{E399"a "! %
 %
 %
 %
r/   r  c                   T  ^  \ rS rSr% \rSr\R                  \	S'   SS\
R                  S4S\S\S	\
R                  S
\4U 4S jjjrSS\R"                  R$                  S\S\S\4S jjr        SS\\   S\R"                  R$                  S\S\\   S\\   S\\   4S jjrSrU =r$ )FlaxCLIPTextPreTrainedModelit  Nmodule_classr   r   r   TrI   seedrJ   _do_initc           	      L   > U R                   " SXS.UD6n[        TU ]	  XX#XES9  g )NrI   rJ   input_shaper#  rJ   r$  r$   )r!  super__init__	r@   rI   r(  r#  rJ   r$  kwargsmodule	__class__s	           r0   r*  $FlaxCLIPTextPreTrainedModel.__init__x  s2     ""H&HH[SXlr/   rngr(  paramsr9   c                 J   [         R                  " USS9n[         R                  " [         R                  " [         R                  " U5      R
                  S   5      U5      n[         R                  " U5      n[        R                  R                  U5      u  pxXxS.n	U R                  R                  XXe5      S   n
Ubd  [        [        U
5      5      n
[        [        U5      5      nU R                   H	  nX   X;'   M     [        5       U l        [!        [#        U5      5      $ U
$ )NrX   rY   r  r1  r   r1  )r*   zerosbroadcast_torj   
atleast_2drr   	ones_liker`   randomsplitr-  initr   r   _missing_keyssetr   r   )r@   r0  r(  r1  r   rk   r   
params_rngr   rngsrandom_paramsmissing_keys               r0   init_weights(FlaxCLIPTextPreTrainedModel.init_weights  s    IIk6	''

3>>)3L3R3RSU3V(WYdey1"%**"2"23"7
$=((.WX`a(-)@AM!(6"23F#11&3&@#  2!$D.011  r/   r   trainr   r   r   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	UcV  [        R
                  " [        R                  " [        R                  " U5      R                  S   5      UR                  5      nUc  [        R                  " U5      n0 n
Ub  XZS'   U R                  R                  SU=(       d    U R                  0[        R                  " USS9[        R                  " USS9[        R                  " USS9U(       + UUU	U
S9	$ )Nr  r   r1  rX   rY   r>  )rI   r   r   r   r*   r5  rj   r6  rr   r7  r-  applyr1  array)r@   r   r   rk   r1  r   rC  r   r   r   r>  s              r0   r~   $FlaxCLIPTextPreTrainedModel.__call__  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N ")O{{  v,-IIit,IInD1IIl$/I  ! 

 
	
r/   r;  r   NNNNFNNN)r%   r&   r'   r(   r   config_classr!  ra   Moduler,   r*   r   intrJ   r   r*  r`   r8  PRNGKeyr-   r   rA  r   dictr~   r.   __classcell__r.  s   @r0   r   r   t  s	   !L"L"))"
 ;;
m
m 	
m
 yy
m 
m 
m!

 2 2 ! !PZ !fp !0 !%*.,0/3&*'

 '
 ZZ'''
 '
 $D>'
 'tn'
 d^'
 '
r/   r   c                   ^  ^  \ rS rSr% \rSrSr\R                  \
S'   SS\R                  S4S\S\\   S	\S
\R                   S\4
U 4S jjjrSS\R(                  R*                  S\S\S\4S jjr      SS\\   S\R(                  R*                  S\S\\   S\\   S\\   4S jjrSrU =r$ )FlaxCLIPVisionPreTrainedModeli  rv   Nr!  r   TrI   r(  r#  rJ   r$  c           	         > Uc  SUR                   UR                   S4nU R                  " SXS.UD6n[        TU ]  XX#XES9  g )Nr   r   r&  r'  r$   )r]   r!  r)  r*  r+  s	           r0   r*  &FlaxCLIPVisionPreTrainedModel.__init__  sR     f//1B1BAFK""H&HH[SXlr/   r0  r1  r9   c                    [         R                  R                  X5      n[         R                  R                  U5      u  pVXVS.nU R                  R                  Xt5      S   nUbd  [        [        U5      5      n[        [        U5      5      nU R                   H	  n	X   X9'   M     [        5       U l        [        [        U5      5      $ U$ )Nr3  r1  )r`   r8  rc   r9  r-  r:  r   r   r;  r<  r   r   )
r@   r0  r(  r1  rv   r=  r   r>  r?  r@  s
             r0   rA  *FlaxCLIPVisionPreTrainedModel.init_weights  s    zz((:"%**"2"23"7
$=((<XF(-)@AM!(6"23F#11&3&@#  2!$D.011  r/   r   rC  r   r   r   c           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  n[        R
                  " US5      n0 nUb  X8S'   U R                  R                  SU=(       d    U R                  0[        R                  " U[        R                  S9U(       + UUUUS9$ )Nr   rU   r   r   r   r1  rY   rE  )rI   r   r   r   r*   	transposer-  rF  r1  rG  r   )	r@   rv   r1  r   rC  r   r   r   r>  s	            r0   r~   &FlaxCLIPVisionPreTrainedModel.__call__  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY}}\<@ ")O{{  v,-IIl#++6I  ! 
 	
r/   rI  r   )NNFNNN)r%   r&   r'   r(   r   rK  main_input_namer!  ra   rL  r,   r*   r   r   r-   rM  rJ   r   r*  r`   r8  rN  r   rA  rO  r~   r.   rP  rQ  s   @r0   rS  rS    s   #L$O"L"))"
 (,;;m m e_m 	m
 yym m m!

 2 2 ! !PZ !fp !, "&*.,0/3&*
 
 ZZ''	

 
 $D>
 'tn
 d^
 
r/   rS  c                     ^  \ rS rSr% \rSr\R                  \	S'   SS\
R                  S4S\S\\   S\S	\
R                  S
\4
U 4S jjjrSS\R&                  R(                  S\S\S\4S jjr        SS\\   S\R&                  R(                  S\S\\   S\\   S\\   4S jjr     SS\\   S\R&                  R(                  4S jjr SS\\   S\R&                  R(                  4S jjrSrU =r$ )FlaxCLIPPreTrainedModeli  Nr!  r   TrI   r(  r#  rJ   r$  c           	         > Uc0  SSUR                   R                  UR                   R                  S44nU R                  " SXS.UD6n[        TU ]  XX#XES9  g )Nr"  r   r   r&  r'  r$   )vision_configr]   r!  r)  r*  r+  s	           r0   r*   FlaxCLIPPreTrainedModel.__init__  sd     !Av';';'F'FH\H\HgHgij#klK""H&HH[SXlr/   r0  r1  r9   c                    [         R                  " US   SS9n[         R                  " [         R                  " [         R                  " U5      R
                  S   5      US   5      n[         R                  " U5      n[        R                  R                  XS   5      n[        R                  R                  U5      u  pXS.n
U R                  R                  XXvU5      S   nUbd  [        [        U5      5      n[        [        U5      5      nU R                   H	  nX   X<'   M     [!        5       U l        [#        [%        U5      5      $ U$ )Nr   rX   rY   r  r   r3  r1  )r*   r4  r5  rj   r6  rr   r7  r`   r8  rc   r9  r-  r:  r   r   r;  r<  r   r   )r@   r0  r(  r1  r   rk   r   rv   r=  r   r>  r?  r@  s                r0   rA  $FlaxCLIPPreTrainedModel.init_weights  s   IIk!nD9	''

3>>)3L3R3RSU3V(WYdefYghy1zz((!n="%**"2"23"7
$=((,Xdefno(-)@AM!(6"23F#11&3&@#  2!$D.011  r/   r   rC  r   r   r   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
UcV  [        R
                  " [        R                  " [        R                  " U5      R                  S   5      UR                  5      nUc  [        R                  " U5      n[        R                  " US5      n0 nUb  XkS'   U R                  R                  SU=(       d    U R                  0[        R                  " USS9[        R                  " U[        R                  S9[        R                  " USS9[        R                  " USS9U(       + UU	U
US9
$ )Nr  rY  r   r1  rX   rY   rE  )rI   r   r   r   r*   r5  rj   r6  rr   r7  rZ  r-  rF  r1  rG  r   )r@   r   rv   r   rk   r1  r   rC  r   r   r   r>  s               r0   r~    FlaxCLIPPreTrainedModel.__call__4  sA    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N}}\<@ ")O{{  v,-IIit,IIl#++6IInD1IIl$/I  ! 
 	
r/   c           
         UcV  [         R                  " [         R                  " [         R                  " U5      R                  S   5      UR                  5      nUc  [         R
                  " U5      n0 nUb  XWS'   S nU R                  R                  SU=(       d    U R                  0[         R                  " USS9[         R                  " USS9[         R                  " USS9U(       + UUS9$ )a  
Args:
    input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)

Returns:
    text_features (`jnp.ndarray` of shape `(batch_size, output_dim`): The text embeddings obtained by applying
    the projection layer to the pooled output of [`FlaxCLIPTextModel`].

Examples:

```python
>>> from transformers import AutoTokenizer, FlaxCLIPModel

>>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
>>> text_features = model.get_text_features(**inputs)
```r  r   c                 V    U R                  UUUUS9nUS   nU R                  U5      nU$ )N)r   r   rk   r   r   
text_modeltext_projection)r-  r   r   rk   r   text_outputsr  text_featuress           r0   _get_features@FlaxCLIPPreTrainedModel.get_text_features.<locals>._get_features  sD    !,,#-)+	 - L )OM"22=AM  r/   r1  rX   rY   methodr>  )
r*   r5  rj   r6  rr   r7  r-  rF  r1  rG  )	r@   r   r   rk   r1  r   rC  r>  rm  s	            r0   get_text_features)FlaxCLIPPreTrainedModel.get_text_featuresa  s    F ++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N ")O		! {{  v,-IIit,IInD1IIl$/I  ! 
 	
r/   c                     [         R                  " US5      n0 nUb  X5S'   S nU R                  R                  SU=(       d    U R                  0[         R
                  " U[         R                  S9U(       + UUS9$ )a  
Args:
    pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
        Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained
        using [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.

Returns:
    image_features (`jnp.ndarray` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`FlaxCLIPVisionModel`]

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, FlaxCLIPModel

>>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="np")

>>> image_features = model.get_image_features(**inputs)
```rY  r   c                 P    U R                  XS9nUS   nU R                  U5      nU$ )N)rv   r   r   )vision_modelvisual_projection)r-  rv   r   vision_outputsr  image_featuress         r0   rm  AFlaxCLIPPreTrainedModel.get_image_features.<locals>._get_features  s5    #00l0hN*1-M#55mDN!!r/   r1  rY   ro  )r*   rZ  r-  rF  r1  rG  r   )r@   rv   r1  r   rC  r>  rm  s          r0   get_image_features*FlaxCLIPPreTrainedModel.get_image_features  sy    < }}\<@ ")O	" {{  v,-IIl#++6I  ! 
 	
r/   rI  r   rJ  )NNNNF)NNF)r%   r&   r'   r(   r   rK  r!  ra   rL  r,   r*   r   r   r-   rM  rJ   r   r*  r`   r8  rN  r   rA  rO  r~   rq  rz  r.   rP  rQ  s   @r0   r^  r^    s   L"L"))"
 (,;;mm e_m 	m
 yym m m!

 2 2 ! !PZ !fp !6 !%*.,0/3&*+
 +
 ZZ''+
 +
 $D>+
 'tn+
 d^+
` !%*.A

 A
 ZZ''A
H jo1
$,TN1
HK

HZHZ1
 1
r/   r^  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxCLIPTextModulei  rI   rJ   c                 J    [        U R                  U R                  S9U l        g r   )r  rI   rJ   ri  rD   s    r0   rn   FlaxCLIPTextModule.setup  s    1$++TZZPr/   r   r   r   r   c           
      ,    U R                  UUUUUUUS9$ )Nr   r   rk   r   r   r   r   ri  )r@   r   r   rk   r   r   r   r   s           r0   r~   FlaxCLIPTextModule.__call__  s/     )%'/!5#  
 	
r/   r  Nr  r  r$   r/   r0   r}  r}    s`    {{E399"Q #"'%* 

 
  
 #
 
 
r/   r}  c                       \ rS rSr\rSrg)FlaxCLIPTextModeli  r$   N)r%   r&   r'   r(   r}  r!  r.   r$   r/   r0   r  r    s    %Lr/   r  a'  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxCLIPTextModel

    >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> last_hidden_state = outputs.last_hidden_state
    >>> pooler_output = outputs.pooler_output  # pooled (EOS token) states
    ```
)output_typerK  c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)%FlaxCLIPTextModelWithProjectionModulei  rI   rJ   c                     [        U R                  U R                  S9U l        [        R
                  " U R                  R                  SU R                  S9U l        g )NrY   F)rS   rJ   )r  rI   rJ   ri  ra   r   projection_dimrj  rD   s    r0   rn   +FlaxCLIPTextModelWithProjectionModule.setup  s>    1$++TZZP!xx(B(BUZ^ZdZder/   r   r   r   r   c           
          U R                  UUUUUUUS9nUS   n	U R                  U	5      n
U(       d  XS   4USS  -   $ [        U
UR                  UR                  UR
                  S9$ )Nr  r   r   rU   )r    r!   r"   r#   )ri  rj  r   r!   r"   r#   )r@   r   r   rk   r   r   r   r   rk  r  r    s              r0   r~   .FlaxCLIPTextModelWithProjectionModule.__call__  s     )%'/!5# ' 
 %Q**=9a1L4DDD&#*<<&44#..	
 	
r/   rh  Nr  r  r$   r/   r0   r  r    s`    {{E399"f #"'%* 

 
  
 #
 
 
r/   r  c                       \ rS rSr\rSrg)FlaxCLIPTextModelWithProjectioni;  r$   N)r%   r&   r'   r(   r  r!  r.   r$   r/   r0   r  r  ;  s    8Lr/   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxCLIPTextModelWithProjection

    >>> model = FlaxCLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
    >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> text_embeds = outputs.text_embeds
    ```
c            	           \ rS rSr% \\S'   \R                  r\R                  \S'   S r	    SS\
S\
S\
S\
4S	 jjrS
rg)FlaxCLIPVisionModuleiY  rI   rJ   c                 J    [        U R                  U R                  S9U l        g r   )r  rI   rJ   ru  rD   s    r0   rn   FlaxCLIPVisionModule.setup]  s    5dkkTr/   r   r   r   r   c                 (    U R                  UUUUUS9$ )Nrv   r   r   r   r   ru  )r@   rv   r   r   r   r   s         r0   r~   FlaxCLIPVisionModule.__call__`  s+       %'/!5# ! 
 	
r/   r  Nr  r  r$   r/   r0   r  r  Y  s`    {{E399"U #"'%* 
 
  	

 #
 
 
r/   r  c                       \ rS rSr\rSrg)FlaxCLIPVisionModeliq  r$   N)r%   r&   r'   r(   r  r!  r.   r$   r/   r0   r  r  q  s    'Lr/   r  a  
    Returns:

    Example:

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, FlaxCLIPVisionModel

    >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> inputs = processor(images=image, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> last_hidden_state = outputs.last_hidden_state
    >>> pooler_output = outputs.pooler_output  # pooled CLS states
    ```
c                   ~    \ rS rSr% \\S'   \R                  r\R                  \S'   S r	        S	S\
4S jjrSrg)
FlaxCLIPModulei  rI   rJ   c                   ^  T R                   R                  nT R                   R                  nT R                   R                  T l        UR                  T l        UR                  T l        [        UT R                  S9T l	        [        UT R                  S9T l        [        R                  " T R                  T R                  [        R                  R                  R!                  S5      SS9T l        [        R                  " T R                  T R                  [        R                  R                  R!                  S5      SS9T l        T R'                  SU 4S j/ 5      T l        g )NrY   rM   F)rJ   rT   rS   logit_scalec                 ^   > [         R                  " U5      TR                  R                  -  $ r   )r*   r   rI   logit_scale_init_value)_rr   r@   s     r0   <lambda>&FlaxCLIPModule.setup.<locals>.<lambda>  s    CHHUOdkk>`>`,`r/   )rI   text_configr`  r  r\   text_embed_dimvision_embed_dimr  rJ   ri  r  ru  ra   r   r`   rb   rc   rv  rj  r_   r  )r@   r  r`  s   `  r0   rn   FlaxCLIPModule.setup  s
   kk--11"kk88)55 - 9 91+TZZP5m4::V!#**++2248	"
  "xx**++2248	 
  ::`bd
r/   Nr   c	           
      :   Ub  UOU R                   R                  nU R                  UUUUUS9n	U R                  UUUUUUUS9n
U	S   nU R	                  U5      nU
S   nU R                  U5      nU[        R                  R                  USSS9-  nU[        R                  R                  USSS9-  n[        R                  " U R                  5      n[        R                  " XR                  5      U-  nUR                  nU(       d  XXX4$ [        UUUUU
U	S9$ )Nr  r  r   r  T)r[   keepdims)r4   r5   r    r6   r7   r8   )rI   r   ru  ri  rv  rj  r*   linalgnormexpr  matmulTr2   )r@   r   rv   r   rk   r   r   r   r   rw  rk  r6   r    r  r5   r4   s                   r0   r~   FlaxCLIPModule.__call__  sD    &1%<k$++BYBY**%'/!5# + 
 )%'/!5# ' 
 &a(--l;"1o**;7 $cjjoolVZo&[[!CJJOOKbSWO$XX ggd../**[..AKO*,,${R^oo-+#%* .
 	
r/   )r  r  r  ri  rj  r  ru  rv  )NNNNTNNN)r%   r&   r'   r(   r   r,   r*   r   rJ   rn   r   r~   r.   r$   r/   r0   r  r    sM    {{E399"
< "!8
 8
 8
r/   r  c                       \ rS rSr\rSrg)FlaxCLIPModeli  r$   N)r%   r&   r'   r(   r  r!  r.   r$   r/   r0   r  r    s    !Lr/   r  ai  
    Returns:

    Example:

    ```python
    >>> import jax
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, FlaxCLIPModel

    >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> inputs = processor(
    ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True
    ... )

    >>> outputs = model(**inputs)
    >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    >>> probs = jax.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
    ```
)r  r^  r  r   r  r  rS  )Mtypingr   r   r   flax
flax.linenlinenra   r`   	jax.numpynumpyr*   flax.core.frozen_dictr   r   r   r	   r
   flax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_clipr   r   r   
get_loggerr%   loggerCLIP_START_DOCSTRINGCLIP_TEXT_INPUTS_DOCSTRINGCLIP_VISION_INPUTS_DOCSTRINGCLIP_INPUTS_DOCSTRINGstruct	dataclassr   r2   rL  rG   r   r   r   r   r   r   r  r  r   rS  r^  r}  r  FLAX_CLIP_TEXT_MODEL_DOCSTRINGr  r  .FLAX_CLIP_TEXT_MODEL_WITH_PROJECTION_DOCSTRINGr  r   FLAX_CLIP_VISION_MODEL_DOCSTRINGr  r  FLAX_CLIP_MODEL_DOCSTRING__all__r$   r/   r0   <module>r     s    ( '   
  > > 6 > ;  X  @ ? L L 
		H	%! F @  ! H 9k 9 9:  
[  
  
F#ryy #LRYY .X		 Xv")) ('299 'T,
bii ,
^
bii 
4?
bii ?
D/
		 /
dL
"5 L
^E
$7 E
PJ
1 J
Z
 
8&3 &" & *,FIg,g h  #AP^
'
BII '
T9&A 92 .$ #%?Bp%p !#1HWe

299 
0(7 ($  0 ,.JMm.m n  %CRb
X
RYY X
v *+"+ " ," 6 (=@Y(Y Z  NYc dr/   