
    bCi                       S r SSKJr  SSKrSSKJr  SSKrSSK	J
r
  SSKJrJrJrJrJrJrJr  SSKJrJr  SS	KJrJrJrJrJr  S
SKJr  \R<                  " \5      r Sr!\ " S S\5      5       r" " S S\RF                  RH                  5      r% " S S\RF                  RH                  5      r& " S S\RF                  RH                  5      r' " S S\RF                  RH                  5      r( " S S\RF                  RH                  5      r) " S S\RF                  RH                  5      r* " S S\RF                  RH                  5      r+ " S S\RF                  RH                  5      r, " S S \RF                  RH                  5      r- " S! S"\RF                  RH                  5      r. " S# S$\RF                  RH                  5      r/ " S% S&\RF                  RH                  5      r0 " S' S(\RF                  RH                  5      r1 " S) S*\RF                  RH                  5      r2\ " S+ S,\RF                  RH                  5      5       r3 " S- S.\5      r4S/r5S0r6\" S1\55       " S2 S3\45      5       r7\" S4\55       " S5 S6\4\5      5       r8/ S7Qr9g)8zTF 2.0 Cvt model.    )annotationsN)	dataclass   )&TFImageClassifierOutputWithNoAttention)TFModelInputTypeTFPreTrainedModelTFSequenceClassificationLossget_initializerkeraskeras_serializableunpack_inputs)
shape_liststable_softmax)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )	CvtConfigr   c                  D    \ rS rSr% SrSrS\S'   SrS\S'   SrS\S'   S	r	g)
TFBaseModelOutputWithCLSToken3   a  
Base class for model's outputs.

Args:
    last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
        Classification token at the output of the last layer of the model.
    hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
        the initial embedding outputs.
Ntf.Tensor | Nonelast_hidden_statecls_token_valueztuple[tf.Tensor, ...] | Nonehidden_states )
__name__
__module____qualname____firstlineno____doc__r   __annotations__r   r   __static_attributes__r       a/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/cvt/modeling_tf_cvt.pyr   r   3   s+     +/'.(,O%,26M/6r&   r   c                  >   ^  \ rS rSrSrSU 4S jjrSSS jjrSrU =r$ )	TFCvtDropPathH   zDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
    (1) github.com:rwightman/pytorch-image-models
c                2   > [         TU ]  " S0 UD6  Xl        g )Nr   )super__init__	drop_prob)selfr.   kwargs	__class__s      r'   r-   TFCvtDropPath.__init__N   s    "6""r&   c                f   U R                   S:X  d  U(       d  U$ SU R                   -
  n[        R                  " U5      S   4S[        [        R                  " U5      5      S-
  -  -   nU[        R                  R                  USSU R                  S9-   n[        R                  " U5      nX-  U-  $ )N        r   r   )r   )dtype)r.   tfshapelenrandomuniformcompute_dtypefloor)r/   xtraining	keep_probr7   random_tensors         r'   callTFCvtDropPath.callR   s    >>S H&	!Q!DC,<q,@$AA!BII$5$5eQI[I[$5$\\/..r&   )r.   )r.   floatN)r=   	tf.Tensor)	r   r    r!   r"   r#   r-   rA   r%   __classcell__r1   s   @r'   r)   r)   H   s    
#/ /r&   r)   c                  d   ^  \ rS rSrSr              SU 4S jjrSS	S jjrS
S jrSrU =r	$ )TFCvtEmbeddings\   z-Construct the Convolutional Token Embeddings.c           
        > [         T	U ]  " S0 UD6  [        UUUUUUSS9U l        [        R
                  R                  U5      U l        g )Nconvolution_embeddings)
patch_sizenum_channels	embed_dimstridepaddingnamer   )r,   r-   TFCvtConvEmbeddingsrL   r   layersDropoutdropout)
r/   configrM   rN   rO   rP   rQ   dropout_rater0   r1   s
            r'   r-   TFCvtEmbeddings.__init___   sO     	"6"&9!%)'
# ||++L9r&   c                F    U R                  U5      nU R                  X2S9nU$ Nr>   )rL   rV   )r/   pixel_valuesr>   hidden_states       r'   rA   TFCvtEmbeddings.callv   s(    22<@||L|Dr&   c                   U R                   (       a  g SU l         [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       g = f)NTrL   )builtgetattrr6   
name_scoperL   rR   buildr/   input_shapes     r'   rd   TFCvtEmbeddings.build{   e    ::
4148Dt::??@++11$7 A@ E@@   A88
B)ra   rL   rV   )rW   r   rM   intrN   rj   rO   rj   rP   rj   rQ   rj   rX   rC   F)r]   rE   r>   boolreturnrE   rD   
r   r    r!   r"   r#   r-   rA   rd   r%   rF   rG   s   @r'   rI   rI   \   s^    7:: : 	:
 : : : :.
8 8r&   rI   c                  \   ^  \ rS rSrSr            SU 4S jjrSS jrS	S jrSrU =r	$ )
rS      zcImage to Convolution Embeddings. This convolutional operation aims to model local spatial contexts.c           
       > [         TU ]  " S	0 UD6  [        R                  R	                  US9U l        [        U[        R                  R                  5      (       a  UOX"4U l
        [        R                  R                  UUUSS[        UR                  5      SS9U l        [        R                  R                  SSS9U l        X0l        X@l        g )
NrQ   validchannels_last
projection)filterskernel_sizestridesrQ   data_formatkernel_initializerrR   h㈵>normalizationepsilonrR   r   )r,   r-   r   rT   ZeroPadding2DrQ   
isinstancecollectionsabcIterablerM   Conv2Dr
   initializer_rangeru   LayerNormalizationr|   rN   rO   )	r/   rW   rM   rN   rO   rP   rQ   r0   r1   s	           r'   r-   TFCvtConvEmbeddings.__init__   s     	"6"||11'1B(2:{?W?W(X(X*_i^v,,--"'.v/G/GH . 
 #\\<<TP_<`("r&   c                   [        U[        5      (       a  US   nU R                  U R                  U5      5      n[	        U5      u  p#pEX4-  n[
        R                  " XXe4S9nU R                  U5      n[
        R                  " XX4U4S9nU$ )Nr]   r7   )r   dictru   rQ   r   r6   reshaper|   )r/   r]   
batch_sizeheightwidthrN   hidden_sizes          r'   rA   TFCvtConvEmbeddings.call   s    lD))'7Lt||L'AB 3=\2J/
Enzz,;6]^)),7 zz,6R^6_`r&   c                "   U R                   (       a  g SU l         [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S S U R                  /5        S S S 5        [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        g g ! , (       d  f       Ny= f! , (       d  f       g = f)NTru   r|   )
ra   rb   r6   rc   ru   rR   rd   rN   r|   rO   re   s     r'   rd   TFCvtConvEmbeddings.build   s    ::
4t,8t334%%tT49J9J&KL 54$/;t11667""(($dnn)EF 87 < 54 87s   *C/<)D /
C= 
D)ra   rO   r|   rN   rQ   rM   ru   )rW   r   rM   rj   rN   rj   rO   rj   rP   rj   rQ   rj   )r]   rE   rm   rE   rD   rn   rG   s   @r'   rS   rS      sV    m## # 	#
 # # #6 	G 	Gr&   rS   c                  H   ^  \ rS rSrSrSU 4S jjrSS	S jjrS
S jrSrU =r	$ ) TFCvtSelfAttentionConvProjection   zConvolutional projection layer.c                6  > [         TU ]  " S
0 UD6  [        R                  R	                  US9U l        [        R                  R                  UU[        UR                  5      SUSSUS9U l	        [        R                  R                  SSSS	9U l        X l        g )Nrr   rs   Fconvolution)rv   rw   rz   rQ   rx   use_biasrR   groupsr{   g?r|   )r~   momentumrR   r   )r,   r-   r   rT   r   rQ   r   r
   r   r   BatchNormalizationr|   rO   )r/   rW   rO   rw   rP   rQ   r0   r1   s          r'   r-   )TFCvtSelfAttentionConvProjection.__init__   s    "6"||11'1B <<..#.v/G/GH / 	
 #\\<<TTW^m<n"r&   c                d    U R                  U R                  U5      5      nU R                  XS9nU$ r[   )r   rQ   r|   r/   r^   r>   s      r'   rA   %TFCvtSelfAttentionConvProjection.call   s4    ''\(BC)),)Jr&   c                $   U R                   (       a  g SU l         [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S S U R                  /5        S S S 5        [        U SS 5      b]  [        R                  " U R                  R
                  5         U R                  R                  S S S U R                  /5        S S S 5        g g ! , (       d  f       Nz= f! , (       d  f       g = f)NTr   r|   )	ra   rb   r6   rc   r   rR   rd   rO   r|   re   s     r'   rd   &TFCvtSelfAttentionConvProjection.build   s    ::
4-9t//445  &&dD$..'IJ 64$/;t11667""(($dDNN)KL 87 < 65 87s   *C0<*D0
C>
D)ra   r   rO   r|   rQ   )
rW   r   rO   rj   rw   rj   rP   rj   rQ   rj   rk   r^   rE   r>   rl   rm   rE   rD   rn   rG   s   @r'   r   r      s    )#"
	M 	Mr&   r   c                  "    \ rS rSrSrSS jrSrg)"TFCvtSelfAttentionLinearProjection   z7Linear projection layer used to flatten tokens into 1D.c                V    [        U5      u  p#pEX4-  n[        R                  " XXe4S9nU$ )Nr   )r   r6   r   )r/   r^   r   r   r   rN   r   s          r'   rA   'TFCvtSelfAttentionLinearProjection.call   s1    2<\2J/
Enzz,;6]^r&   r   Nr^   rE   rm   rE   )r   r    r!   r"   r#   rA   r%   r   r&   r'   r   r      s
    Ar&   r   c                  d   ^  \ rS rSrSr S           SU 4S jjjrS	S
S jjrSS jrSrU =r	$ )TFCvtSelfAttentionProjection   z'Convolutional Projection for Attention.c           	     r   > [         TU ]  " S0 UD6  US:X  a  [        XX4USS9U l        [	        5       U l        g )Ndw_bnconvolution_projectionrR   r   )r,   r-   r   r   r   linear_projection)	r/   rW   rO   rw   rP   rQ   projection_methodr0   r1   s	           r'   r-   %TFCvtSelfAttentionProjection.__init__   sB     	"6"'*J;F^+D' "D!Er&   c                F    U R                  XS9nU R                  U5      nU$ r[   )r   r   r   s      r'   rA   !TFCvtSelfAttentionProjection.call  s+    22<2S--l;r&   c                   U R                   (       a  g SU l         [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       g = f)NTr   )ra   rb   r6   rc   r   rR   rd   re   s     r'   rd   "TFCvtSelfAttentionProjection.build  rh   ri   )ra   r   r   )r   )rW   r   rO   rj   rw   rj   rP   rj   rQ   rj   r   strrk   r   rD   rn   rG   s   @r'   r   r      sf    1 ")FF F 	F
 F F F F"
8 8r&   r   c                     ^  \ rS rSrSr S                       S	U 4S jjjrS
S jrSSS jjrSS jrSr	U =r
$ )TFCvtSelfAttentioni  z
Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
query, key, and value embeddings.
c           
       > [         TU ]  " S0 UD6  US-  U l        Xl        X0l        X l        [        UUUUUU	S:X  a  SOU	SS9U l        [        UUUUUU	SS9U l        [        UUUUUU	SS9U l	        [        R                  R                  U[        UR                  5      U
SS	S
9U l        [        R                  R                  U[        UR                  5      U
SSS
9U l        [        R                  R                  U[        UR                  5      U
SSS
9U l        [        R                  R%                  U5      U l        g )Ng      avglinearconvolution_projection_query)r   rR   convolution_projection_keyconvolution_projection_valuezerosprojection_queryunitsrz   r   bias_initializerrR   projection_keyprojection_valuer   )r,   r-   scalewith_cls_tokenrO   	num_headsr   r   r   r   r   rT   Denser
   r   r   r   r   rU   rV   )r/   rW   r   rO   rw   stride_q	stride_kv	padding_q
padding_kvqkv_projection_methodqkv_biasattention_drop_rater   r0   r1   s                 r'   r-   TFCvtSelfAttention.__init__  sg     	"6"_
,"",H*?5*HhNc/-
) +G3-+
' -I3/-
) !& 2 2.v/G/GH$# !3 !
 $ll00.v/G/GH$! 1 
 !& 2 2.v/G/GH$# !3 !
 ||++,?@r&   c                    [        U5      u  p#nU R                  U R                  -  n[        R                  " XX0R                  U4S9n[        R
                  " USS9nU$ )Nr   r      r   r   perm)r   rO   r   r6   r   	transpose)r/   r^   r   r   _head_dims         r'   "rearrange_for_multi_head_attention5TFCvtSelfAttention.rearrange_for_multi_head_attention_  sU    %/%="
>>T^^3zz,;P^P^`h6ij||L|Dr&   c                   U R                   (       a  [        R                  " USX#-  /S5      u  pQ[        U5      u  pgn[        R                  " XX#U4S9nU R                  XS9n	U R                  XS9n
U R                  XS9nU R                   (       aC  [        R                  " WU
4SS9n
[        R                  " XY4SS9n	[        R                  " X[4SS9nU R                  U R                  -  nU R                  U R                  U
5      5      n
U R                  U R                  U	5      5      n	U R                  U R                  U5      5      n[        R                  " XSS9U R                   -  n[#        USS9nU R%                  XS9n[        R                  " X5      n[        U5      u    nnn[        R&                  " US	S
9n[        R                  " XXpR                  U-  45      nU$ )Nr   r   r\   axisT)transpose_b)logitsr   r   r   )r   r6   splitr   r   r   r   r   concatrO   r   r   r   r   r   matmulr   r   rV   r   )r/   r^   r   r   r>   	cls_tokenr   r   rN   keyqueryvaluer   attention_scoreattention_probscontextr   s                    r'   rA   TFCvtSelfAttention.callf  s   &(hh|a=PRS&T#I 1;<0H-
zz,6R^6_`--l-N11,1R11,1RIIy%0q9E))Y,15CIIy0q9E>>T^^3778M8Me8TU55d6I6I#6NO778M8Me8TU))EDADJJN(bI,,,J))O3)'21k1,,w\:**W;QY@Y&Z[r&   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      b[  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        [        U SS 5      b[  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        g g ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GNp= f! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       g = f)NTr   r   r   r   r   r   )ra   rb   r6   rc   r   rR   rd   r   r   r   rO   r   r   re   s     r'   rd   TFCvtSelfAttention.build  s   ::
47>Jt@@EEF1177= G45t<Ht>>CCD//55d; E47>Jt@@EEF1177= G4+T2>t4499:%%++T4,HI ;4)40<t22778##))4t~~*FG 94+T2>t4499:%%++T4,HI ;: ? GF ED GF ;: 98 ;:sH   I+.I=
J&)J!)J38)K+
I:=
J
J!
J03
K
K)ra   r   r   r   rV   rO   r   r   r   r   r   r   T)rW   r   r   rj   rO   rj   rw   rj   r   rj   r   rj   r   rj   r   rj   r   r   r   rl   r   rC   r   rl   r   rk   
r^   rE   r   rj   r   rj   r>   rl   rm   rE   rD   )r   r    r!   r"   r#   r-   r   rA   rd   r%   rF   rG   s   @r'   r   r     s    $  $GAGA GA 	GA
 GA GA GA GA GA  #GA GA #GA GA GAR DJ Jr&   r   c                  H   ^  \ rS rSrSrSU 4S jjrSS	S jjrS
S jrSrU =r	$ )TFCvtSelfOutputi  zOutput of the Attention layer .c                   > [         TU ]  " S0 UD6  [        R                  R	                  U[        UR                  5      SS9U l        [        R                  R                  U5      U l	        X l
        g Ndense)r   rz   rR   r   )r,   r-   r   rT   r   r
   r   r   rU   rV   rO   )r/   rW   rO   	drop_rater0   r1   s        r'   r-   TFCvtSelfOutput.__init__  s^    "6"\\''@X@X0Y`g ( 

 ||++I6"r&   c                B    U R                  US9nU R                  XS9nU$ N)inputs)r   r>   r   rV   r   s      r'   rA   TFCvtSelfOutput.call  s(    zzz6||<|Kr&   c                ,   U R                   (       a  g SU l         [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        g g ! , (       d  f       g = fNTr   ra   rb   r6   rc   r   rR   rd   rO   re   s     r'   rd   TFCvtSelfOutput.build  g    ::
4$'3tzz/

  $dnn!=> 0/ 4//   )B
B)ra   r   rV   rO   )rW   r   rO   rj   r   rC   rk   r   rD   rn   rG   s   @r'   r   r     s    )#
? ?r&   r   c                     ^  \ rS rSrSr S                         S	U 4S jjjrS rS
SS jjrSS jrSr	U =r
$ )TFCvtAttentioni  zDAttention layer. First chunk of the convolutional transformer block.c                z   > [         TU ]  " S0 UD6  [        UUUUUUUUU	U
UUSS9U l        [	        XUSS9U l        g )N	attentionr   outputr   )r,   r-   r   r  r   dense_output)r/   rW   r   rO   rw   r   r   r   r   r   r   r   r   r   r0   r1   s                  r'   r-   TFCvtAttention.__init__  s[    " 	"6"+!
 ,FyxXr&   c                    [         erD   )NotImplementedError)r/   headss     r'   prune_headsTFCvtAttention.prune_heads  s    !!r&   c                D    U R                  XX4S9nU R                  XTS9nU$ r[   )r  r	  )r/   r^   r   r   r>   self_outputattention_outputs          r'   rA   TFCvtAttention.call  s.    nn\5nT,,[,Lr&   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       Nl= f! , (       d  f       g = f)NTr  r	  )ra   rb   r6   rc   r  rR   rd   r	  re   s     r'   rd   TFCvtAttention.build  s    ::
4d+7t~~223$$T* 44.:t00556!!''- 76 ; 43 76s   C.C%
C"%
C3)r  ra   r	  r   )rW   r   r   rj   rO   rj   rw   rj   r   rj   r   rj   r   rj   r   rj   r   r   r   rl   r   rC   r   rC   r   rl   rk   )r^   rE   r   rj   r   rj   r>   rl   rD   )r   r    r!   r"   r#   r-   r  rA   rd   r%   rF   rG   s   @r'   r  r    s    N   $!Y!Y !Y 	!Y
 !Y !Y !Y !Y !Y  #!Y !Y #!Y !Y !Y !YF" 
	. 	.r&   r  c                  D   ^  \ rS rSrSrSU 4S jjrSS jrS	S jrSrU =r	$ )
TFCvtIntermediatei  zNIntermediate dense layer. Second chunk of the convolutional transformer block.c                   > [         TU ]  " S0 UD6  [        R                  R	                  [        X#-  5      [        UR                  5      SSS9U l        X l	        g )Ngelur   )r   rz   
activationrR   r   )
r,   r-   r   rT   r   rj   r
   r   r   rO   )r/   rW   rO   	mlp_ratior0   r1   s        r'   r-   TFCvtIntermediate.__init__  sT    "6"\\''i+,.v/G/GH	 ( 

 #r&   c                (    U R                  U5      nU$ rD   )r   )r/   r^   s     r'   rA   TFCvtIntermediate.call   s    zz,/r&   c                ,   U R                   (       a  g SU l         [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        g g ! , (       d  f       g = fr   r   re   s     r'   rd   TFCvtIntermediate.build  r  r  )ra   r   rO   )rW   r   rO   rj   r  rj   r   rD   rn   rG   s   @r'   r  r    s    X#? ?r&   r  c                  H   ^  \ rS rSrSrSU 4S jjrSS	S jjrS
S jrSrU =r	$ )TFCvtOutputi  zm
Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
c                   > [         TU ]  " S0 UD6  [        R                  R	                  U[        UR                  5      SS9U l        [        R                  R                  U5      U l	        X l
        X0l        g r   )r,   r-   r   rT   r   r
   r   r   rU   rV   rO   r  )r/   rW   rO   r  r   r0   r1   s         r'   r-   TFCvtOutput.__init__  sc    "6"\\''@X@X0Y`g ( 

 ||++I6""r&   c                J    U R                  US9nU R                  XS9nX-   nU$ r   r   )r/   r^   input_tensorr>   s       r'   rA   TFCvtOutput.call  s0    zzz6||<|K#2r&   c           	     X   U R                   (       a  g SU l         [        U SS 5      br  [        R                  " U R                  R
                  5         U R                  R                  S S [        U R                  U R                  -  5      /5        S S S 5        g g ! , (       d  f       g = fr   )
ra   rb   r6   rc   r   rR   rd   rj   rO   r  re   s     r'   rd   TFCvtOutput.build!  su    ::
4$'3tzz/

  $c$..4>>2Q.R!ST 0/ 4//s   ?B
B))ra   r   rV   rO   r  )rW   r   rO   rj   r  rj   r   rj   rk   )r^   rE   r&  rE   r>   rl   rm   rE   rD   rn   rG   s   @r'   r"  r"    s    #U Ur&   r"  c                     ^  \ rS rSrSr S                             SU 4S jjjrS	S
S jjrSS jrSrU =r	$ )
TFCvtLayeri*  a  
Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It
consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the
`Block` class in the original implementation.
c                  > [         TU ]  " S0 UD6  [        UUUUUUUUU	U
UUUSS9U l        [	        XUSS9U l        [        XXSS9U l        US:  a
  [        USS9O[        R                  R                  SSS9U l        [        R                  R                  SS	S
9U l        [        R                  R                  SSS
9U l        X0l        g )Nr  r   intermediater  r4   	drop_pathr   r{   layernorm_beforer}   layernorm_afterr   )r,   r-   r  r  r  r-  r"  r	  r)   r   rT   
Activationr.  r   r/  r0  rO   )r/   rW   r   rO   rw   r   r   r   r   r   r   r   r   r  drop_path_rater   r0   r1   s                    r'   r-   TFCvtLayer.__init__1  s    & 	"6"'!
  .fQ_`'9V^_ # .{;(((D 	 !& ? ?Se ? f$||>>tRc>d"r&   c                    U R                  U R                  U5      X#US9nU R                  XTS9nXQ-   nU R                  U5      nU R	                  U5      nU R                  Xa5      nU R                  XdS9nU$ r[   )r  r/  r.  r0  r-  r	  )r/   r^   r   r   r>   r  layer_outputs          r'   rA   TFCvtLayer.callb  s    >>$*?*?*Mvgo>p>>*:>N (6 ++L9((6 ((D~~l~Fr&   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      b[  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        g g ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GNc= f! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       g = f)NTr  r-  r	  r.  r/  r0  )ra   rb   r6   rc   r  rR   rd   r-  r	  r.  r/  rO   r0  re   s     r'   rd   TFCvtLayer.builds  s   ::
4d+7t~~223$$T* 44.:t00556!!''- 74.:t00556!!''- 74d+7t~~223$$T* 44+T2>t4499:%%++T4,HI ;4*D1=t33889$$**D$+GH :9 > 43 76 76 43 ;: :9sH   I.I0
J&J)J&+)J7
I-0
I?
J
J#&
J47
K)r  ra   r	  r.  rO   r-  r0  r/  r   )rW   r   r   rj   rO   rj   rw   rj   r   rj   r   rj   r   rj   r   rj   r   r   r   rl   r   rC   r   rC   r  rC   r2  rC   r   rl   rk   r   rD   rn   rG   s   @r'   r+  r+  *  s    ,  $!/#/# /# 	/#
 /# /# /# /# /#  #/# /# #/# /# /# /#  !/# /#b"I Ir&   r+  c                  H   ^  \ rS rSrSrSU 4S jjrSS	S jjrS
S jrSrU =r	$ )
TFCvtStagei  a+  
Cvt stage (encoder block). Each stage has 2 parts :
- (1) A Convolutional Token Embedding layer
- (2) A Convolutional Transformer Block (layer).
The classification token is added only in the last stage.

Args:
    config ([`CvtConfig`]): Model configuration class.
    stage (`int`): Stage number.
c                  > [         TU ]  " S0 UD6  Xl        X l        U R                  R                  U R                     (       aN  U R                  SSU R                  R                  S   4[        U R                  R                  5      SSS9U l        [        U R                  UR                  U R                     U R                  S:X  a  UR                  OUR                  U R                  S-
     UR                  U R                     UR                  U R                     UR                  U R                     UR                  U R                     SS9U l        [         R"                  " S	UR$                  U R                     UR&                  U   5      nU Vs/ s H   oUR)                  5       R+                  5       PM"     nn[-        UR&                  U R                     5       Vs/ s GHX  n[/        U4UR0                  U R                     UR                  U R                     UR2                  U R                     UR4                  U R                     UR6                  U R                     UR8                  U R                     UR:                  U R                     UR<                  U R                     UR>                  U R                     UR@                  U R                     UR                  U R                     URB                  U R                     X@R                     UR                  U R                     S
U 3S.6PGM[     snU l"        g s  snf s  snf )Nr   r   Tzcvt.encoder.stages.2.cls_token)r7   initializer	trainablerR   r   	embedding)rM   rN   rP   rO   rQ   rX   rR   r4   zlayers.)r   rO   rw   r   r   r   r   r   r   r   r   r  r2  r   rR   r   )#r,   r-   rW   stager   
add_weightrO   r
   r   rI   patch_sizesrN   patch_stridepatch_paddingr   r>  r6   linspacer2  depthnumpyitemranger+  r   
kernel_qkvr   r   r   r   r   r   r   r  rT   )r/   rW   r?  r0   drop_path_ratesr=   jr1   s          r'   r-   TFCvtStage.__init__  s   "6"
;;  ,!__!T[[22267+DKK,I,IJ5	 - DN )KK))$**504

a,,VEUEUVZV`V`cdVdEe&&tzz2&&tzz2((4))$**5	
 ++c6+@+@+Lfll[`Nab5DE_779>>+_E( 6<<

34'
& 5%  **4::6 **4::6"--djj94 **4::6 **4::6!,,TZZ8&,&B&B4::&N4$*$>$>tzz$J **4::6 **4::6.zz:%//

;qc]!$ 5'
 F
s   'M)E Mc                ,   S nU R                  X5      n[        U5      u  pEpgXV-  n[        R                  " XX4S9nU R                  R
                  U R                     (       a6  [        R                  " U R
                  USS9n[        R                  " X14SS9nU R                   H  n	U	" XXbS9n
U
nM     U R                  R
                  U R                     (       a  [        R                  " USXV-  /S5      u  p1[        R                  " XXVU4S9nX4$ )Nr   r   )repeatsr   r   r   r\   )r>  r   r6   r   rW   r   r?  repeatr   rT   r   )r/   r^   r>   r   r   r   r   rN   r   layerlayer_outputss              r'   rA   TFCvtStage.call  s    	~~l= 3=\2J/
Enzz,;6]^;;  ,		$..*1MI99i%>QGL[[E!,QM(L ! ;;  ,&(hh|a=PRS&T#I zz,6R^6_`&&r&   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  U R                   H=  n[        R                  " UR
                  5         UR                  S 5        S S S 5        M?     g g ! , (       d  f       Nk= f! , (       d  f       Mc  = f)NTr>  rT   )ra   rb   r6   rc   r>  rR   rd   rT   r/   rf   rP  s      r'   rd   TFCvtStage.build  s    ::
4d+7t~~223$$T* 444(4]]5::.KK% /. % 5 43 /.s   C3C$
C!$
C3	)ra   r   rW   r>  rT   r?  )rW   r   r?  rj   rk   )r^   rE   r>   rl   rD   rn   rG   s   @r'   r:  r:    s    	-
^'0
& 
&r&   r:  c                  d   ^  \ rS rSrSr\rSU 4S jjr   S         S	S jjrS
S jr	Sr
U =r$ )TFCvtEncoderi  z
Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
(depth) being 1, 2 and 10.

Args:
    config ([`CvtConfig`]): Model configuration class.
c           	        > [         TU ]  " S0 UD6  Xl        [        [	        UR
                  5      5       Vs/ s H  n[        XSU 3S9PM     snU l        g s  snf )Nzstages.r   r   )r,   r-   rW   rH  r8   rE  r:  stages)r/   rW   r0   	stage_idxr1   s       r'   r-   TFCvtEncoder.__init__  sY    "6"W\]`agamam]nWo
Wo)Jv/DEWo
 
s   Ac                Z   U(       a  SOS nUn[         R                  " USS9nS n[        U R                  5       H  u  pU	" XdS9u  pgU(       d  M  XV4-   nM     [         R                  " USS9nU(       a  [	        S U 5       5      nU(       d  [	        S XgU4 5       5      $ [        UUUS9$ )	Nr   )r   r   r   r   r   r\   r   r   r   r   c              3  L   #    U  H  n[         R                  " US S9v   M     g7f)r]  r   N)r6   r   ).0hss     r'   	<genexpr>$TFCvtEncoder.call.<locals>.<genexpr>  s     %fTebbll2L&ITes   "$c              3  .   #    U  H  oc  M  Uv   M     g 7frD   r   )r_  vs     r'   ra  rb    s     b$Pq$Ps   	r   r   r   )r6   r   	enumeraterY  tupler   )
r/   r]   output_hidden_statesreturn_dictr>   all_hidden_statesr^   r   r   stage_modules
             r'   rA   TFCvtEncoder.call  s     #7BD# ||L|D	!*4;;!7A&2<&S#L##$5$G! "8 ||L|D %%fTe%f fb\>O$Pbbb,*%+
 	
r&   c                   U R                   (       a  g SU l         [        U SS 5      bN  U R                   H=  n[        R                  " UR
                  5         UR                  S 5        S S S 5        M?     g g ! , (       d  f       MR  = f)NTrY  )ra   rb   rY  r6   rc   rR   rd   rT  s      r'   rd   TFCvtEncoder.build  s`    ::
44(4]]5::.KK% /. % 5..s   A77
B	)ra   rW   rY  rW   r   )FTF)
r]   r   rh  bool | Noneri  rp  r>   rp  rm   0TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]rD   )r   r    r!   r"   r#   r   config_classr-   rA   rd   r%   rF   rG   s   @r'   rW  rW    s_     L
 -2#' %
&
 *
 !	

 
 
:
B& &r&   rW  c                  p   ^  \ rS rSrSr\rSU 4S jjr\    S         S	S jj5       r	S
S jr
SrU =r$ )TFCvtMainLayeri(  zConstruct the Cvt model.c                P   > [         TU ]  " S0 UD6  Xl        [        USS9U l        g )Nencoderr   r   )r,   r-   rW   rW  rv  )r/   rW   r0   r1   s      r'   r-   TFCvtMainLayer.__init__.  s&    "6"#F;r&   c                    Uc  [        S5      eU R                  UUUUS9nUS   nU(       d	  U4USS  -   $ [        UUR                  UR                  S9$ )N You have to specify pixel_valuesrh  ri  r>   r   r   re  )
ValueErrorrv  r   r   r   )r/   r]   rh  ri  r>   encoder_outputssequence_outputs          r'   rA   TFCvtMainLayer.call3  s{     ?@@,,!5#	 ' 
 *!,#%(;;;,-+;;)77
 	
r&   c                   U R                   (       a  g SU l         [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       g = f)NTrv  )ra   rb   r6   rc   rv  rR   rd   re   s     r'   rd   TFCvtMainLayer.buildP  s^    ::
4D)5t||001""4( 21 611ri   )ra   rW   rv  ro  NNNF)
r]   zTFModelInputType | Nonerh  rp  ri  rp  r>   rp  rm   rq  rD   )r   r    r!   r"   r#   r   rr  r-   r   rA   rd   r%   rF   rG   s   @r'   rt  rt  (  sl    "L<
  15,0#' %
-
 *
 !	

 
 
:
 
8) )r&   rt  c                  $    \ rS rSrSr\rSrSrSr	g)TFCvtPreTrainedModeliY  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
cvtr]   r   N)
r   r    r!   r"   r#   r   rr  base_model_prefixmain_input_namer%   r   r&   r'   r  r  Y  s    
 L$Or&   r  a  

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TF 2.0 models accepts two formats as inputs:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.

    This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
    tensors in the first argument of the model call function: `model(inputs)`.

    </Tip>

    Args:
        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
al  
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
z]The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.c                     ^  \ rS rSrSU 4S jjr\\" \5      \" \	\
S9    S         S	S jj5       5       5       rS
S jrSrU =r$ )
TFCvtModeli  c                L   > [         TU ]  " U/UQ70 UD6  [        USS9U l        g )Nr  r   )r,   r-   rt  r  r/   rW   r   r0   r1   s       r'   r-   TFCvtModel.__init__  s(    3&3F3!&u5r&   output_typerr  c                    Uc  [        S5      eU R                  UUUUS9nU(       d  US   4USS -   $ [        UR                  UR                  UR
                  S9$ )a'  
Returns:

Examples:

```python
>>> from transformers import AutoImageProcessor, TFCvtModel
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
>>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")

>>> inputs = image_processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```Nry  )r]   rh  ri  r>   r   r   re  )r{  r  r   r   r   r   )r/   r]   rh  ri  r>   outputss         r'   rA   TFCvtModel.call  sy    > ?@@((%!5#	  
 AJ=712;..,%77#33!//
 	
r&   c                   U R                   (       a  g SU l         [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       g = f)NTr  )ra   rb   r6   rc   r  rR   rd   re   s     r'   rd   TFCvtModel.build  sZ    ::
4%1txx}}-t$ .- 2--ri   )ra   r  ro  r  )
r]   r   rh  rp  ri  rp  r>   rp  rm   rq  rD   )r   r    r!   r"   r-   r   r   TFCVT_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCrA   rd   r%   rF   rG   s   @r'   r  r    s    
6
 *+AB+HWfg *.,0#' %-
&-
 *-
 !	-

 -
 
:-
 h C -
^% %r&   r  z
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                     ^  \ rS rSrSU 4S jjr\\" \5      \" \	\
S9     S           S	S jj5       5       5       rS
S jrSrU =r$ )TFCvtForImageClassificationi  c                F  > [         TU ]  " U/UQ70 UD6  UR                  U l        [        USS9U l        [
        R                  R                  SSS9U l        [
        R                  R                  UR                  [        UR                  5      SSSS	9U l        Xl        g )
Nr  r   r{   	layernormr}   Tr   
classifierr   )r,   r-   
num_labelsrt  r  r   rT   r   r  r   r
   r   r  rW   r  s       r'   r-   $TFCvtForImageClassification.__init__  s    3&3F3 ++!&u588K8X  ,,,,##.v/G/GH$ - 
 r&   r  c                   U R                  UUUUS9nUS   nUS   nU R                  R                  S   (       a  U R                  U5      nOM[	        U5      u  pp[
        R                  " XyXU-  4S9n[
        R                  " USS9nU R                  U5      n[
        R                  " USS9nU R                  U5      nUc  S	OU R                  X.S
9nU(       d  U4USS	 -   nUb  U4U-   $ U$ [        XUR                  S9$ )a{  
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Returns:

Examples:

```python
>>> from transformers import AutoImageProcessor, TFCvtForImageClassification
>>> import tensorflow as tf
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
>>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

>>> inputs = image_processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> # model predicts one of the 1000 ImageNet classes
>>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
>>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
```rz  r   r   r   r   )r   r   r   r   r   N)labelsr   r   )lossr   r   )r  rW   r   r  r   r6   r   r   reduce_meanr  hf_compute_lossr   r   )r/   r]   r  rh  ri  r>   r  r}  r   r   rN   r   r   sequence_output_meanr   r  r  s                    r'   rA    TFCvtForImageClassification.call  s   R ((!5#	  
 "!*AJ	;;  $"nnY7O 7A6Q3Jf jj\di[i@jkO ll?KO"nn_=O!~~oAF!56~t4+?+?v+?+]Y,F)-)9TGf$EvE54^e^s^sttr&   c                h   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bh  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  S   /5        S S S 5        [        U SS 5      b  [        U R                  S5      (       ai  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  S   /5        S S S 5        g g g ! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       g = f)NTr  r  r   r  rR   )ra   rb   r6   rc   r  rR   rd   r  rW   rO   hasattrr  re   s     r'   rd   !TFCvtForImageClassification.build7  s(   ::
4%1txx}}-t$ .4d+7t~~223$$dD$++2G2G2K%LM 44t,8t//]]4??#7#78OO))4t{{7L7LR7P*QR 98 0 9 .- 43 98s$   F .6F?6F# 
F
F #
F1)ra   r  rW   r  r  r  ro  )NNNNF)r]   r   r  r   rh  rp  ri  rp  r>   rp  rm   z9TFImageClassifierOutputWithNoAttention | tuple[tf.Tensor]rD   )r   r    r!   r"   r-   r   r   r  r   r   r  rA   rd   r%   rF   rG   s   @r'   r  r    s    $ *+AB+Q`op *.#',0#' %@u&@u !@u *	@u
 !@u @u 
C@u q C @uDS Sr&   r  )r  r  r  ):r#   
__future__r   collections.abcr   dataclassesr   
tensorflowr6   modeling_tf_outputsr   modeling_tf_utilsr   r   r	   r
   r   r   r   tf_utilsr   r   utilsr   r   r   r   r   configuration_cvtr   
get_loggerr   loggerr  r   rT   Layerr)   rI   rS   r   r   r   r   r   r  r  r"  r+  r:  rW  rt  r  TFCVT_START_DOCSTRINGr  r  r  __all__r   r&   r'   <module>r     se    "  !  I   3  ) 
		H	%  7K 7 7(/ELL&& /(%8ell(( %8P7G%,,,, 7Gt"Mu||'9'9 "MJ);); 85<<#5#5 8DMJ++ MJ`?ell(( ?27.U\\'' 7.t?** ?4U%,,$$ U:^I## ^IB]&## ]&@:&5<<%% :&z -)U\\'' -) -)`%, % 8 & c>%% >%	>%B  eS"68T eSeSP Pr&   