
    6biT                        S r SSKrSSKrSSKJs  Jr  SSKJ	r	  SSK
Jr  SSK
Jr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  \R0                  r\R2                  r\R4                  r\R6                  r\R8                  rSrSr " S S\R>                  R@                  5      r! " S S\RD                  5      r# " S S\RH                  5      r%g)z'Keras index lookup preprocessing layer.    N)backend)base_layer_utils)base_preprocessing_layer)preprocessing_utils)layer_serialization)layer_utils)tf_utils)
tf_loggingvocabidf_weightsc                   D    \ rS rSrSrS r\S 5       r\S 5       rS r	Sr
g)	NullInitializer,   zEA placeholder initializer for restoring this layer from a SavedModel.c                     Xl         X l        g)z{Construct a table initializer object.

Args:
  key_dtype: Type of the table keys.
  value_dtype: Type of the table values.
N
_key_dtype_value_dtype)self	key_dtypevalue_dtypes      h/home/james-whalen/.local/lib/python3.13/site-packages/tf_keras/src/layers/preprocessing/index_lookup.py__init__NullInitializer.__init__/   s     $'    c                     U R                   $ )zThe expected table key dtype.)r   r   s    r   r   NullInitializer.key_dtype9   s     r   c                     U R                   $ )zThe expected table value dtype.)r   r   s    r   r   NullInitializer.value_dtype>   s        r   c                     g)z$Returns the table initialization op.N )r   tables     r   
initializeNullInitializer.initializeC   s    r   r   N)__name__
__module____qualname____firstlineno____doc__r   propertyr   r   r#   __static_attributes__r!   r   r   r   r   ,   s7    O(   ! !r   r   c                   :    \ rS rSrSrS r\S 5       rS rS r	Sr
g)	VocabWeightHandlerH   z;Adds the vocabulary as a layer weight during serialization.c                 x    Xl         UR                  U l        [        R                  R                  5       U l        g N)_layervocabulary_dtype_dtypetf
distributeget_strategy_distribute_strategy)r   lookup_layers     r   r   VocabWeightHandler.__init__K   s+    
 #"33$&MM$>$>$@!r   c                     g)N   r!   r   s    r   num_tensorsVocabWeightHandler.num_tensorsT   s    r   c                     [         R                  " US   U R                  5      nU R                  R	                  U5      U R                  l        g Nr   )r4   convert_to_tensorr3   r1   _lookup_table_from_tokenslookup_table)r   weightstokenss      r   set_weightsVocabWeightHandler.set_weightsX   s7    %%gaj$++>#';;#H#H#P r   c                 z    U R                   R                  SS9n[        R                  " XR                  5      nU/$ )NFinclude_special_tokens)r1   get_vocabularyr4   r@   r3   )r   rD   s     r   get_tensorsVocabWeightHandler.get_tensors\   s4    ++5+I%%fkk:xr   )r7   r3   r1   N)r%   r&   r'   r(   r)   r   r*   r<   rE   rK   r+   r!   r   r   r-   r-   H   s*    EA  Qr   r-   c                     ^  \ rS rSrSr      S$U 4S jjrS rS rS%S jrS r	S r
U 4S	 jrS
 rS&S jrS rS rS rS rS rS rS rS rS rS rS rS rS rS rS rS rS rS rS r S r!S  r"\#S! 5       r$S" r%S#r&U =r'$ )'IndexLookupc   a  Maps values from a vocabulary to integer indices.

This layer translates a set of arbitrary hashables into an integer output
via a table-based lookup, with optional out-of-vocabulary handling. This is
the basis layer for both IntegerLookup and StringLookup; it holds the common
logic but is not intended to be exported as part of the TF-Keras API.

Args:
  max_tokens: The maximum size of the vocabulary for this layer. If None,
    there is no cap on the size of the vocabulary. Note that this size
    includes the OOV and mask tokens.
  num_oov_indices: The number of out-of-vocabulary tokens to use. If this
    value is more than 1, OOV inputs are hashed to determine their OOV
    value. If this value is 0, OOV inputs will cause an error when calling
    the layer.
  mask_token: A token that represents masked inputs. When `output_mode` is
    `"int"`, the token is included in vocabulary and mapped to index 0. In
    other output modes, the token will not appear in the vocabulary and
    instances of the mask token in the input will be dropped. If set to
    None, no mask term will be added.
  oov_token: Only used when `invert` is True. The token to return for OOV
    indices.
  vocabulary: Optional. Either an array or a string path to a text file. If
    passing an array, can pass a tuple, list, 1D numpy array, or 1D tensor
    containing the vocbulary terms. If passing a file path, the file should
    contain one line per term in the vocabulary. If this argument is set,
    there is no need to `adapt` the layer.
  vocabulary_dtype: The dtype of the vocabulary terms. For example,
    `"int64"` or `"string"`.
  idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list,
    1D numpy array, or 1D tensor or the same length as the vocabulary,
    containing the floating point inverse document frequency weights, which
    will be multiplied by per sample term counts for the final `tf_idf`
    weight. If the `vocabulary` argument is set, and `output_mode` is
    `"tf_idf"`, this argument must be supplied.
  invert: Only valid when `output_mode` is `"int"`. If True, this layer will
    map indices to vocabulary items instead of mapping vocabulary items to
    indices. Defaults to `False`.
  output_mode: Specification for the output of the layer. Values can be
    `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"`
    configuring the layer as follows:
      - `"int"`: Return the raw integer indices of the input tokens.
      - `"one_hot"`: Encodes each individual element in the input into an
        array the same size as the vocabulary, containing a 1 at the element
        index. If the last dimension is size 1, will encode on that
        dimension.  If the last dimension is not size 1, will append a new
        dimension for the encoded output.
      - `"multi_hot"`: Encodes each sample in the input into a single array
        the same size as the vocabulary, containing a 1 for each vocabulary
        term present in the sample. Treats the last dimension as the sample
        dimension, if input shape is (..., sample_length), output shape will
        be (..., num_tokens).
      - `"count"`: As `"multi_hot"`, but the int array contains a count of
        the number of times the token at that index appeared in the sample.
      - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
        find the value in each token slot.
    Defaults to `"int"`.
  pad_to_max_tokens: Only valid when `output_mode` is `"multi_hot"`,
    `"count"`, or `"tf_idf"`. If True, the output will have its feature axis
    padded to `max_tokens` even if the number of unique tokens in the
    vocabulary is less than max_tokens, resulting in a tensor of shape
    [batch_size, max_tokens] regardless of vocabulary size. Defaults to
    False.
  sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, `"count"`
    and `"tf-idf"` output modes. If True, returns a `SparseTensor` instead
    of a dense `Tensor`. Defaults to `False`.
c                 0
  > Ub  US::  a  [        SU 35      eU(       a  Uc  [        SU 35      eUS:  a  [        SU 35      eU	S:X  a  [        n	U	S:X  a  [        n	[        R                  " U	[
        [        [        [        [        4U R                  R                  SS	9  U(       a  U	[
        :w  a  [        S
U	 35      eU
(       a  U	[
        :X  a  [        SU
 SU	 35      eUb  U	[        :w  a  [        SU SU	 35      eXl
        Xl        X l        X0l        X@l        Xl        Xl        Xl        XPl        UR'                  SS 5      U l        X`l        Xpl        UR'                  SUS L5      U l        UR'                  SS 5        SU;  a2  U	[
        :X  a  [0        R2                  O[4        R6                  " 5       US'   [8        TU ]t  " S0 UD6  U	[
        :X  aB  [0        R<                  " U R>                  5      R@                  (       d  US   n[        SU 35      eU(       af  U	[
        :X  a  U RB                  O[0        R2                  U l"        [0        R<                  " U R$                  5      U l#        SnUnU R                  U l$        O[0        R<                  " U R$                  5      U l"        U	[
        :X  a  U RB                  O[0        R2                  U l#        UnU R                  [
        :X  a  SOU RF                  RJ                  nU R                  S:X  a  SU l$        O-U R                  S:X  a  U RM                  5       U l$        OSU l$        U R                  bJ  [0        RN                  " XRD                  5      U l(        [0        RN                  " XRF                  5      U l)        U R                  [        :X  aW  [0        RT                  " S/U RW                  5       -  SU R>                  SS9U l,        U RX                  R[                  5       U l.        Ub  U R_                  Xg5        OU Ra                  5       U l1        U R.                  (       d  U Re                  [g        U 5      S5        [0        Rh                  Rj                  Rm                  U[0        R2                  SS9U l7        U R                  [        :X  af  [0        Rh                  Rj                  Rm                  U[0        R2                  SS9U l8        [0        RT                  " S[0        R2                  SS9U l9        g g g )Nr;   zBIf set, `max_tokens` must be greater than 1. Received: max_tokens=zJIf pad_to_max_tokens is True, must set `max_tokens`. Received: max_tokens=r   zP`num_oov_indices` must be greater than or equal to 0. Received: num_oov_indices=binaryztf-idfoutput_mode)allowable_strings
layer_namearg_namezK`output_mode` must be `'int'` when `invert` is true. Received: output_mode=zt`sparse` may only be true if `output_mode` is `'one_hot'`, `'multi_hot'`, `'count'` or `'tf_idf'`. Received: sparse=z and output_mode=zW`idf_weights` should only be set if `output_mode` is `'tf_idf'`. Received: idf_weights=vocabulary_sizehas_input_vocabularyhas_static_tabledtypezMWhen `output_mode='int'`, `dtype` should be an integer type. Received: dtype=r0   F)shaperY   	trainable)r   r   default_value)rY   r\   r!   ):
ValueError	MULTI_HOTTF_IDFr   validate_string_argINTONE_HOTCOUNT	__class__r%   invert
max_tokensnum_oov_indices
mask_token	oov_tokenrR   sparsepad_to_max_tokensr2   pop_frozen_vocab_sizeinput_vocabularyinput_idf_weights_has_input_vocabularyr4   int64r   floatxsuperr   as_dtypecompute_dtype
is_integerrY   r   r   _default_valuemax_oov_start_indexr@   	_mask_key_mask_valueVariable_token_start_indexr   valueidf_weights_constset_vocabulary_uninitialized_lookup_tablerB   _add_trackabler-   lookupexperimentalMutableHashTabletoken_countstoken_document_countsnum_documents)r   rg   rh   ri   rj   r2   
vocabularyr   rf   rR   rk   rl   kwargsinput_dtypemask_key
mask_valuere   s                   r   r   IndexLookup.__init__   s   " !jAo((2|5 
 !3((2|5 
 Q--<,=?  ("#K(" K''"GYvF~~.."		
 kS())47 
 kS($$*8 ,*m-  "{f'<55@M B*m-  $.$"&!2 0"(**->"E *!,
 &,ZZ"Zt%;&
"
 	

%t, & '3.GNN4D 7O 	"6" 3KK 2 23>> /K))47 
 ,73,>djjBHHDO "D,A,A BDH#J"&..D kk$*?*?@DO.9S.@

bhhD!H #..#54;L;L;P;PJ##q( ')#%%* '+&;&;&=# ')#??&11(OOLDN!33-- D v%!{{d--//((	 D &*%5%5%;%;%=D"!
8 !% @ @ BD )) 24 8%@ "		 6 6 G G*HH !H !D
 6)II**;;"2$&HH&' <  * &([[RXX&" * *r   c                     U R                   [        :X  a  U$ U R                  (       a  U R                  OU R                  n[
        R                  " US   U/5      $ r?   )rR   rb   rl   rg   rn   r4   TensorShape)r   input_shapedepths      r   compute_output_shape IndexLookup.compute_output_shapea  sR    s" %% OO(( 	
 ~~{1~u566r   c                     U R                  UR                  R                  5       5      nU R                  (       a  U R                  OU R
                  n[        R                  " X#S9$ )N)r[   rY   )r   r[   as_listrf   r2   rv   r4   
TensorSpec)r   
input_specoutput_shapeoutput_dtypes       r   compute_output_signature$IndexLookup.compute_output_signaturek  sN    001A1A1I1I1KL%)[[D!!d6H6H 	 }}<DDr   c                 2  ^  T R                   R                  5       S:X  a  / / p2OUT R                   R                  5       u  pET R                  (       a  XT4OXE4u  p#T R	                  U5      UR                  5       p2[        R                  " U 4S j[        X25      5      n[        T R                  5       5       Vs/ s H  ovU   PM	     nnT R                  b#  T R                  [        :X  a  T R                  US'   U(       d  UT R                  5       S nU$ s  snf )aA  Returns the current vocabulary of the layer.

Args:
  include_special_tokens: If True, the returned vocabulary will include
    mask and OOV tokens, and a term's index in the vocabulary will equal
    the term's index when calling the layer. If False, the returned
    vocabulary will not include any mask or OOV tokens.
r   c                     > T R                   $ r0   )rj   r   s   r   <lambda>,IndexLookup.get_vocabulary.<locals>.<lambda>  s	    DNNr   N)rB   sizeexportrf   _tensor_vocab_to_numpynumpycollectionsdefaultdictziprangerV   ri   rR   rb   r~   )r   rI   r   indiceskeysvaluesr   xs   `       r   rJ   IndexLookup.get_vocabularyr  s     !!#q(7,,335LD/3{{f^NE++E2  (("C$7
 %*$*>*>*@$AB$Aq$AB??&4+;+;s+BE!H%$11356E Cs   8Dc                    [         R                  " 5       (       aB  [        U R                  R	                  5       R                  5       5      U R                  5       -   $ U R                  R	                  5       U R                  5       -   $ )zGets the current size of the layer's vocabulary.

Returns:
  The integer size of the vocabulary, including optional mask and oov
  indices.
)r4   executing_eagerlyintrB   r   r   r~   r   s    r   rV   IndexLookup.vocabulary_size  sn     !!D%%**,2245))+,
 $$))+d.E.E.GGGr   c                 N    [         R                  " S5        U R                  5       $ )Nz5vocab_size is deprecated, please use vocabulary_size.)loggingwarningrV   r   s    r   
vocab_sizeIndexLookup.vocab_size  s    OP##%%r   c                   > U R                   U R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  [        R                  " U R                  5      [        R                  " U R                  5      U R                  S.n[        TU ]=  5       n[!        [#        UR%                  5       5      [#        UR%                  5       5      -   5      $ )N)rf   rg   rh   rj   ri   rR   rk   rl   r2   r   r   rV   )rf   rg   rh   rj   ri   rR   rk   rl   r2   utilslistify_tensorsrp   ro   rn   rt   
get_configdictlistitems)r   configbase_configre   s      r   r   IndexLookup.get_config  s    kk//#33//++kk!%!7!7 $ 5 5 001G1GH//0E0EF#66
 g(*D**,-V\\^0DDEEr   c                     U R                  5         [        R                  " 5          U R                  5       U l        S S S 5        g ! , (       d  f       g = fr0   )_ensure_vocab_size_unchangedr4   
init_scoperV   rn   r   s    r   _record_vocabulary_size#IndexLookup._record_vocabulary_size  s2    ))+]]_&*&:&:&<D# __s   A
Ac                 B   U R                   [        :X  a  Uc  [        S5      eOUb  [        SU R                    SU 35      e[        U[        5      (       a  [
        R                  R                  R                  U5      (       d  [        SU S35      eU R                   [        :X  a  [        S5      eU R                  U5      U l
        U R                  5         g[
        R                  " 5       (       do  [
        R                  " U5      (       d  [
        R                  " U5      (       a9  [        SR                  U R                   R"                  U R$                  5      5      e[
        R                  " U5      (       a  U R'                  U5      nO1[        U[(        [*        45      (       a  [,        R.                  " U5      n[
        R                  " U5      (       a  UR1                  5       nO1[        U[(        [*        45      (       a  [,        R.                  " U5      nUR2                  S	:X  a  [        S
U S35      eU R5                  5       nU R7                  5       nU R8                  /U-  U R:                  /U R<                  -  -   n[,        R>                  " XQSU 5      nU(       a  XS nOUnU RA                  U5      nU(       a  [        SR                  U5      5      eU R8                  bS  U R8                  U;   aC  [,        RB                  " XR8                  :H  5      S   n	[        SU SU R8                   SU	 35      eU R:                  bd  U RD                  (       aS  U R:                  U;   aC  [,        RB                  " XR:                  :H  5      S   n
[        SU SU R:                   SU
 35      eU[G        U5      -   nU RH                  b3  XRH                  :  a$  [        SR                  XRH                  5      5      eU RK                  U5      U l
        U R                  5         U R                   [        :X  Gae  USLGa^  [G        U5      [G        U5      :w  a-  [        SR                  [G        U5      [G        U5      5      5      eU RM                  U5      nURN                  S:w  a#  [        SR                  [Q        U5      5      5      eU(       a  S	nS	nOUn[,        RR                  " U5      nS	nU RT                  (       a)  U RH                  b  U RH                  U-
  [G        U5      -
  nOS	n[,        RV                  " UX4SX4S9n[
        RX                  " UU RZ                  S9nU R\                  R_                  U5        U R\                  Ra                  5       U l1        ggg)a  Sets vocabulary (and optionally document frequency) for this layer.

This method sets the vocabulary and idf weights for this layer directly,
instead of analyzing a dataset through `adapt`. It should be used
whenever the vocab (and optionally document frequency) information is
already known.  If vocabulary data is already present in the layer, this
method will replace it.

Args:
  vocabulary: Either an array or a string path to a text file. If
    passing an array, can pass a tuple, list, 1D numpy array, or 1D
    tensor containing the vocbulary terms. If passing a file path, the
    file should contain one line per term in the vocabulary.
  idf_weights: A tuple, list, 1D numpy array, or 1D tensor of inverse
    document frequency weights with equal length to vocabulary. Must be
    set if `output_mode` is `"tf_idf"`. Should not be set otherwise.

Raises:
  ValueError: If there are too many inputs, the inputs do not match, or
    input data is missing.
  RuntimeError: If the vocabulary cannot be set when this function is
    called. This happens when `"multi_hot"`, `"count"`, and `"tf_idf"`
    modes, if `pad_to_max_tokens` is False and the layer itself has
    already been called.
  RuntimeError: If a tensor vocabulary is passed outside of eager
    execution.
Nz2`idf_weights` must be set if output_mode is TF_IDFzU`idf_weights` should only be set if output_mode is `'tf_idf'`. Received: output_mode=z and idf_weights=zVocabulary file z does not exist.zGoutput_mode `'tf_idf'` does not support loading a vocabulary from file.zCannot set a tensor vocabulary on {} layer {} when not executing eagerly. Create this layer or call `set_vocabulary` outside of any `tf.function`s and with eager execution enabled.r   z+Cannot set an empty vocabulary, you passed .zmThe passed vocabulary has at least one repeated term. Please uniquify your dataset. The repeated terms are {}rZ   a  Found reserved mask token at unexpected location in `vocabulary`. Note that passed `vocabulary` does not need to include the OOV and mask tokens. Either remove all mask and OOV tokens, or include them only at the start of the vocabulary in precisely this order: z. Received: mask_token=z at vocabulary index a  Found reserved OOV token at unexpected location in `vocabulary`. Note that passed `vocabulary` does not need to include the OOV and mask tokens. Either remove all mask and OOV tokens, or include them only at the start of the vocabulary in precisely this order: z. Received: oov_token=zpAttempted to set a vocabulary larger than the maximum vocab size. Passed vocab size is {}, max vocab size is {}.Fzb`idf_weights` must be the same length as vocabulary. len(idf_weights) is {}, len(vocabulary) is {}r;   z4TF-IDF data must be a 1-index array, but received {}constantconstant_valuesrY   )2rR   r`   r^   
isinstancestrr4   iogfileexists_lookup_table_from_filerB   r   r   	is_tensorRuntimeErrorformatre   r%   namer   r   tuplenparrayr   r   rz   r~   ri   rj   rh   array_equal_find_repeated_tokensargwhererf   lenrg   rA   _convert_to_ndarrayndimtypeaveragerl   padr@   rv   r   assignr   r   )r   r   r   	oov_starttoken_startspecial_tokensfound_special_tokensrD   repeated_tokens
mask_index	oov_indexnew_vocab_sizefront_paddingfront_padding_valueback_padding_valueback_paddingrC   s                    r   r   IndexLookup.set_vocabulary  s;   8 v%" H  # $5595E5E4F G##.-1  j#&&55;;%%j11 &zl2BC  6) ,  !% < <Z HD((*##%%LL$$[(A(A "6$.."9"9499E	  <<
##44Z@J
T5M22*-J<<$$%++-KdE]33((;/K??a=j\K  ))+	--///*Y6NN:
  :! !  "~~|4 
  -FF44V<0  ??&4??f+DZ??%BCBGJ7 8F6F G((,'8 9$$.<1  NN&&(J..$@A"EI7 8F6F G''+~~&6 7$$-;0  %s6{2??&N__,LGGMv"OOH  !::6B$$&v%+U*B:#k"22 DDJFJ[)9E  22;?K1$ &&,fT+->&?  $ !&'# +&(jj&=# "#%%$//*EOOm3c+6FF   !ff-!4 I	G **7$:L:LMG##G,%)%5%5%;%;%=D"W +C%r   c                    U R                   (       a.  [        SR                  U R                  R                  5      5      e[
        R                  " XR                  S9nUR                  R                  S:X  a  [        R                  " US5      nUR                  R                  S:X  a  [        R                  " US5      nU R                  U5      u  p#U R                  R                  X#U R                  R                  U5      -   5        U R                   ["        :X  a  [        R$                  " S U5      nU R                  U5      u  p%U R&                  R                  X%U R&                  R                  U5      -   5        [(        R*                  " U5      (       a*  U R,                  R/                  UR1                  5       5        g U R,                  R/                  [        R                  " U[        R2                  S9S   5        g g )Nz^Cannot adapt {} layer after setting a static vocabulary via init argument or `set_vocabulary`.r   r   r;   c                 4    [         R                  " U 5      S   $ r?   )r4   unique)r   s    r   r   *IndexLookup.update_state.<locals>.<lambda>  s    299Q<?r   )out_type)rq   r^   r   re   r%   r   ensure_tensorr2   r[   rankr4   expand_dims_num_tokensr   insertr   rR   r`   map_fnr   r	   	is_raggedr   
assign_addnrowsrr   )r   datarD   countsdeduped_doc_data
doc_countss         r   update_stateIndexLookup.update_statev  s   %%''-vdnn.E.E'F  ""4/D/DE::??a>>$*D::??a >>$*D))$/  T..55f==	
 v%!yy)BDI!%!1!12B!CF&&--T%?%?%F%Fv%NN !!$''""--djjl;""--HHTBHH5a8 &r   c                 |   U R                   (       d4  [        R                  " U R                  R	                  5       S5      (       aD  U R
                  [        :X  a  U R                  R                  5       U l	        U R                  5         g U R                  bE  U R                  R                  [        R                  " U R                  /U R                  5      5        U R                  bE  U R                  R                  [        R                  " U R                  /U R                  5      5        U R                  R!                  5       u  p["        R$                  " UR'                  5       UR'                  5       45      S S S2   nU R)                  5       nU R*                  (       a  U R*                  U-
  nUS U n[        R,                  " X5      nU R/                  U5      U l        U R
                  [        :X  Ga#  U R2                  R5                  U5      nU R7                  X`R8                  5      n[        R:                  " XpR<                  5      n[        R>                  " UU R)                  5       S//[        R@                  " U5      S9nU RB                  (       aG  U R*                  b:  [        R>                  " USU R*                  [        R                  " U5      -
  //SS9nU R                  RE                  U5        U R                  R                  5       U l	        U RG                  5         U R                  5         g )Nr   rZ   r   )$rq   r4   equalr   r   rR   r`   r   r   r   r   ri   remover@   r2   rj   r   r   lexsortr   r~   rg   gatherrA   rB   r   r   _inverse_document_frequencyr   castrv   r   reduce_meanrl   r   reset_state)r   rD   r   sorted_indicesr   max_learned_tokensr   r   s           r   finalize_stateIndexLookup.finalize_state  s   %%$2C2C2H2H2JA)N)N 6))-)9)9)?)?)A&((* ??&$$$$doo%68M8MN >>%$$$$dnn%5t7L7LM **113 V\\^V\\^$DEddK--/??!%;!>+,?-?@N62 ::6Bv%$($>$>$E$Ef$M!::%'9'9K ''+/A/ABK &&))+Q/0 "{ ;K
 %%$//*E ff277;+??@A$%
 ##K0%)%5%5%;%;%=D"
 	$$&r   c                 `   U R                   (       a  g U R                  R                  U R                  R                  5       S   5        U R                  [
        :X  aR  U R                  R                  U R                  R                  5       S   5        U R                  R                  S5        g g r?   )	rq   r   r  r   rR   r`   r   r   r   r   s    r   r  IndexLookup.reset_state  s    %%  !2!2!9!9!;A!>?v%&&--**113A6 %%a(	 &r   c           	         U R                  5         [        R                  " XR                  S9nUR                  nUR                  R
                  S:X  a  U R                  US5      n[        R                  " U5      (       aF  [        R                  " UR                  U R                  UR                  5      UR                  5      nOW[        R                  " U5      (       a+  [        R                   R#                  U R                  U5      nOU R                  U5      nU R$                  [&        :X  a)  UR
                  S:X  a  [        R(                  " US5      nU$ U R*                  (       a  U R,                  OU R.                  nU R$                  [0        :X  a  U R2                  OS n[        R4                  " UU R$                  UU R6                  U R8                  US9$ )Nr   r   rZ   )rR   r   rY   rk   r   )_ensure_known_vocab_sizer   r   r   r[   r   _expand_dimsr	   	is_sparser4   SparseTensorr   _lookup_denser   dense_shaper   raggedmap_flat_valuesrR   rb   squeezerl   rg   rn   r`   r   encode_categorical_inputsrv   rk   )r   inputsoriginal_shapelookupsr   r   s         r   callIndexLookup.call  sx   %%'$$V??C<<!&&vr2Ff%%oo""6==1""G
 ''ii//0B0BFKG((0Gs"""a'**Wb1N %% OO(( 	 '+&6&6&&@D""d 	 ..(($$;;#
 	
r   c                 H   [         R                  " 5       (       a:  [        R                  " U5      (       a  [         R                  " XR
                  S9nOU R                  R                  U5      nU R                  bA  [         R                  " XR                  5      n[         R                  " X0R                  U5      nU R                  (       a  U$ / nU R                  S:X  a  [         R                  " [         R                  " US5      5      n[         R                  " X5      n[         R                   R#                  SU45      n[         R$                  " [         R                  " [         R&                  " U5      S5      U/5      nUR)                  U5        OU R                  S:  a  U R*                  R,                  (       a*  [         R.                  R1                  XR                  5      nO'[         R                   R3                  XR                  S9nXPR5                  5       -   n[         R                  " X R6                  5      n	[         R                  " XU5      n[         R8                  " U5         [         R:                  " U5      sSSS5        $ ! , (       d  f       g= f)zALookup table values for a dense Tensor, handling masking and OOV.r   Nr   rZ   zwWhen `num_oov_indices=0` all inputs should be in vocabulary, found OOV values {}, consider setting `num_oov_indices=1`.r;   )num_buckets)r4   r   r   is_keras_tensor
zeros_liker   rB   r   ri   r  r{   wherer|   rf   rh   	gather_ndstringsr   Assertr   appendr   rw   mathfloormodto_hash_bucket_fastrz   rx   control_dependenciesidentity)
r   r  r!  mask_locationslookup_checksoov_indices
oov_inputsmsg	assertionoov_locationss
             r   r  IndexLookup._lookup_dense  s    !!g&=&=f&E&EmmF2C2CDG''..v6G??&XXfnn=Nhh~/?/?IG;;N1$((288GR#89Kf:J**##MC
 		"((277;+?"CcUKI  +!!A%)) gg..v7K7KL jj<<(<(< =  &(=(=(??KHHW.A.ABMhh}7CG$$]3;;w' 433s   3J
J!c                 h    U R                   [        :X  a  U R                  R                  5       US'   g g Nr   )rR   r`   r   r   r   stores     r   save_own_variablesIndexLookup.save_own_variables=  s.    v%#'#9#9#?#?#AE-  &r   c                     U R                   [        :X  a>  U R                  R                  US   5        U R                  R	                  5       U l        g g r;  )rR   r`   r   r   r   r   r<  s     r   load_own_variablesIndexLookup.load_own_variablesA  sE    v%##E-$89%)%5%5%;%;%=D" &r   c                 d   U R                   (       a  g U R                  SS9n[        R                  R                  R                  US5      n[        US5       nUR                  SR                  U Vs/ s H  n[        U5      PM     sn5      5        S S S 5        g s  snf ! , (       d  f       g = f)NTrH   vocabulary.txtw
)	ro   rJ   r4   r   r   joinopenwriter   )r   dir_pathr   vocabulary_filepathfrE  s         r   save_assetsIndexLookup.save_assetsF  s       (((E
 eekk..x9IJ%s+qGGDIIz:z!s1vz:;< ,+: ,+s   B!3BB!B!!
B/c                 X   U R                   (       a  g [        R                  R                  R	                  US5      n[        US5       nUR                  5       R                  S5      n[        R                  " U R                  5      [        R                  :X  a  U Vs/ s H  n[        U5      PM     nnOU Vs/ s H  n[        U5      PM     nnU R                  [        :X  a  U R                  USS9  OU R                  U5        S S S 5        g s  snf s  snf ! , (       d  f       g = f)NrD  rrF  F)r   )ro   r4   r   r   rG  rH  readsplitru   r2   stringr   r   rR   r`   r   )r   rJ  rK  rL  linesliner   s          r   load_assetsIndexLookup.load_assetsP  s        eekk..x9IJ%s+qFFHNN4(E{{4001RYY>056#d)6056#d)66)##F#>##F+ ,+ 76 ,+s+   	ADD4D;D8D
D
D)c                     [         R                  " 5          [        U R                  U R                  5      n[         R
                  R                  XR                  5      sS S S 5        $ ! , (       d  f       g = fr0   )r4   r   r   r   r   r   StaticHashTablerx   )r   initializers     r   r   'IndexLookup._uninitialized_lookup_tableb  sC    ]]_)$//4;L;LMK99,,[:M:MN __s   A	A))
A7c                    [         R                  " 5          U R                  5       nU[         R                  " U5      -   nU R                  (       a  U R
                  OU R                  n[         R                  " X#US9nU R                  (       a  XQ4OX4u  pg[         R                  R                  XgU R
                  U R                  5      n[         R                  R                  XR                  5      sS S S 5        $ ! , (       d  f       g = f)Nr   )r4   r   r~   r   rf   r   r   r   r   KeyValueTensorInitializerrY  rx   )	r   rD   r   	token_endindices_dtyper   r   r   rZ  s	            r   rA   %IndexLookup._lookup_table_from_tokensg  s    ]]_113K#bggfo5I#';;D4E4E  hh{]KG%)[[!v6G D ))==doot/@/@K 99,,[:M:MN __s   CC>>
Dc           
         U R                   (       aI  [        R                  R                  R                  n[        R                  R                  R
                  nOH[        R                  R                  R
                  n[        R                  R                  R                  n[        R                  " 5          [        R                  R                  UU R                  UU R                  UU R                  5       S9n[        R                  R                  X@R                  5      sS S S 5        $ ! , (       d  f       g = f)N)filenamer   	key_indexr   value_indexvalue_index_offset)rf   r4   r   TextFileIndexLINE_NUMBER
WHOLE_LINEr   TextFileInitializerr   r   r~   rY  rx   )r   rb  rc  rd  rZ  s        r   r   #IndexLookup._lookup_table_from_filew  s    ;;		//;;I))11<<K		//::I))11==K]]_))77!//# --'#'#:#:#< 8 K 99,,[:M:MN __s   8A-D//
D=c                 h    [        U[        [        45      (       a  [        R                  " U5      $ U$ r0   )r   r   r   r   r   )r   r   s     r   r   IndexLookup._convert_to_ndarray  s%    (T5M::rxx{AAr   c                     [         R                  " U5      (       a  [        R                  R	                  X5      $ [        R                  " X5      $ r0   )r	   r  r4   rk   r   )r   r  axiss      r   r  IndexLookup._expand_dims  s7    f%%99((66>>&//r   c                 L    U R                   b  U R                  [        :X  a  S$ S$ )Nr;   r   )ri   rR   rb   r   s    r   rz   IndexLookup._oov_start_index  s(    ,1A1AS1HA	
NO	
r   c                 <    U R                  5       U R                  -   $ r0   )rz   rh   r   s    r   r~   IndexLookup._token_start_index  s    $$&)=)===r   c                     U R                   [        :X  d  U R                  (       a  g U R                  c2  [	        SU R                    S3R                  U R                   5      5      eg )NWhen using `output_mode=z` and `pad_to_max_tokens=False`, you must set the layer's vocabulary before calling it. Either pass a `vocabulary` argument to the layer, or call `adapt` with some sample data.)rR   rb   rl   rn   r   r   r   s    r   r  $IndexLookup._ensure_known_vocab_size  sc    s"d&<&<""**4+;+;*< =) ) *00@0@)A  +r   c                 Z   U R                   [        :X  d  U R                  (       a  g [        R                  " 5          U R                  5       nS S S 5        U R                  b9  WU R                  :w  a(  [        SU R                    SU R                   SU 35      eg g ! , (       d  f       NU= f)Nru  zt` and `pad_to_max_tokens=False`, the vocabulary size cannot be changed after the layer is called. Old vocab size is z, new vocab size is )rR   rb   rl   r4   r   rV   rn   r   )r   r   s     r   r   (IndexLookup._ensure_vocab_size_unchanged  s    s"d&<&<]]_!113N  ##/$"9"99*4+;+;*< =- .2-D-D,E F%%3$4	6  : 0	 _s   B
B*c                     [        U5      n[        U5      [        U5      :w  aA  [        R                  " U5      R	                  5        VVs/ s H  u  p4US:  d  M  UPM     snn$ / $ s  snnf )z+Return all repeated tokens in a vocabulary.r;   )setr   r   Counterr   )r   r   vocabulary_setitemcounts        r   r   !IndexLookup._find_repeated_tokens  sg    Zz?c.11 $/#6#6z#B#H#H#J#JKD19 #J  Is   A'A'c                 $   [         R                  " U5      (       a  UR                  nO@[         R                  " U5      (       a  UR                  nO[
        R                  " US/5      n[
        R                  " U[
        R                  S9u  p4nX54$ )z?Count the number of tokens in a ragged, sparse or dense tensor.rZ   )out_idx)	r	   r  r   r   flat_valuesr4   reshapeunique_with_countsrr   )r   r   r  rD   _r   s         r   r   IndexLookup._num_tokens  sm    d##++K%%**K**TB40K11+rxxP6~r   c                 R    [         R                  R                  SUSU-   -  -   5      $ )a~  Computes the inverse-document-frequency (IDF) component of "tf_idf".

Uses the default weighting scheme described in
https://en.wikipedia.org/wiki/Tf%E2%80%93idf.

Args:
  token_document_counts: An array of the # of documents each token
    appears in.
  num_documents: An int representing the total number of documents

Returns:
  An array of "inverse document frequency" weights.
r;   )r4   r-  log)r   r   r   s      r   r
  'IndexLookup._inverse_document_frequency  s'     ww{{1}4I0IJJKKr   c                 .    [         R                  " U 5      $ r0   )r   VocabularySavedModelSaverr   s    r   _trackable_saved_model_saver(IndexLookup._trackable_saved_model_saver  s    "<<TBBr   c                 "    UR                  5       $ )z3Converts a tensor vocabulary to a numpy vocabulary.)r   )r   r   s     r   r   "IndexLookup._tensor_vocab_to_numpy  s    !!r   )rx   rn   rq   r   r{   r|   r   r   r   rp   ro   rf   rB   ri   rg   r   rh   rj   rR   rl   rk   r   r   r2   )NNFr   FF)Tr0   )(r%   r&   r'   r(   r)   r   r   r   rJ   rV   r   r   r   r   r  r  r  r"  r  r>  rA  rM  rV  r   rA   r   r   r  rz   r~   r  r   r   r   r
  r*   r  r   r+   __classcell__)re   s   @r   rN   rN   c   s    BV wr7E>H&F$=
y>v"H;'z	))
V.(`B>
=,$O
O O$B0

>
&
	L  C C" "r   rN   )&r)   r   r   r   tensorflow.compat.v2compatv2r4   tf_keras.srcr   tf_keras.src.enginer   r   !tf_keras.src.layers.preprocessingr   r   &tf_keras.src.saving.legacy.saved_modelr   tf_keras.src.utilsr   r	   tensorflow.python.platformr
   r   rb   r_   rc   rd   r`   _VOCAB_NAME_IDF_WEIGHTS_NAMEr   r]  r   TrackableWeightHandlerr-   PreprocessingLayerrN   r!   r   r   <module>r     s    .   ! !   0 8 J F * ' =iiOO	
--	! bii99 8)@@ 6D"*== D"r   