
    6biC6                         S r SSKrSSKrSSKrSSKJr  SSKJr  S r	\" S5       " S S\R                  5      5       r\" S	5      SS
 j5       r\" S5            SS j5       rg)ai  Utilities for preprocessing sequence data.

Deprecated: `tf.keras.preprocessing.sequence` APIs are not recommended for new
code. Prefer `tf.keras.utils.timeseries_dataset_from_array` and
the `tf.data` APIs which provide a much more flexible mechanisms for dealing
with sequences. See the [tf.data guide](https://www.tensorflow.org/guide/data)
for more details.
    N)
data_utils)keras_exportc                     / / pC[        X5       H8  u  pV[        U5      U :  d  M  UR                  U5        UR                  U5        M:     X44$ )a'  Removes sequences that exceed the maximum length.

Args:
    maxlen: Int, maximum length of the output sequences.
    seq: List of lists, where each sublist is a sequence.
    label: List where each element is an integer.

Returns:
    new_seq, new_label: shortened lists for `seq` and `label`.
)ziplenappend)maxlenseqlabelnew_seq	new_labelxys          ]/home/james-whalen/.local/lib/python3.13/site-packages/tf_keras/src/preprocessing/sequence.py_remove_long_seqr   $   sK     RYCq6F?NN1Q       z0keras.preprocessing.sequence.TimeseriesGeneratorc                   H    \ rS rSrSr       S
S jrS rS rS rS r	S	r
g)TimeseriesGenerator7   a8  Utility class for generating batches of temporal data.

Deprecated: `tf.keras.preprocessing.sequence.TimeseriesGenerator` does not
operate on tensors and is not recommended for new code. Prefer using a
`tf.data.Dataset` which provides a more efficient and flexible mechanism for
batching, shuffling, and windowing input. See the
[tf.data guide](https://www.tensorflow.org/guide/data) for more details.

This class takes in a sequence of data-points gathered at
equal intervals, along with time series parameters such as
stride, length of history, etc., to produce batches for
training/validation.

Arguments:
    data: Indexable generator (such as list or Numpy array)
        containing consecutive data points (timesteps).
        The data should be at 2D, and axis 0 is expected
        to be the time dimension.
    targets: Targets corresponding to timesteps in `data`.
        It should have same length as `data`.
    length: Length of the output sequences (in number of timesteps).
    sampling_rate: Period between successive individual timesteps
        within sequences. For rate `r`, timesteps
        `data[i]`, `data[i-r]`, ... `data[i - length]`
        are used for create a sample sequence.
    stride: Period between successive output sequences.
        For stride `s`, consecutive output samples would
        be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
    start_index: Data points earlier than `start_index` will not be used
        in the output sequences. This is useful to reserve part of the
        data for test or validation.
    end_index: Data points later than `end_index` will not be used
        in the output sequences. This is useful to reserve part of the
        data for test or validation.
    shuffle: Whether to shuffle output samples,
        or instead draw them in chronological order.
    reverse: Boolean: if `true`, timesteps in each output sample will be
        in reverse chronological order.
    batch_size: Number of timeseries samples in each batch
        (except maybe the last one).

Returns:
    A [Sequence](
    https://www.tensorflow.org/api_docs/python/tf/tf_keras/utils/Sequence)
    instance.

Examples:
    ```python
    from tf_keras.src.preprocessing.sequence import TimeseriesGenerator
    import numpy as np
    data = np.array([[i] for i in range(50)])
    targets = np.array([[i] for i in range(50)])
    data_gen = TimeseriesGenerator(data, targets,
                                   length=10, sampling_rate=2,
                                   batch_size=2)
    assert len(data_gen) == 20
    batch_0 = data_gen[0]
    x, y = batch_0
    assert np.array_equal(x,
                          np.array([[[0], [2], [4], [6], [8]],
                                    [[1], [3], [5], [7], [9]]]))
    assert np.array_equal(y,
                          np.array([[10], [11]]))
    ```
Nc                    [        U5      [        U5      :w  a)  [        SS[        U5       3-   S[        U5       3-   5      eXl        X l        X0l        X@l        XPl        Xc-   U l        Uc  [        U5      S-
  nXpl        Xl	        Xl
        Xl        U R                  U R                  :  a$  [        SU R                  U R                  4-  5      eg )NzData and targets have to bez  of same length. Data length is z while target length is    zz`start_index+length=%i > end_index=%i` is disallowed, as no part of the sequence would be left to be used as current step.)r   
ValueErrordatatargetslengthsampling_ratestridestart_index	end_indexshufflereverse
batch_size)selfr   r   r   r   r   r   r   r    r!   r"   s              r   __init__TimeseriesGenerator.__init__{   s     t9G$-4SYK@A,S\N;<  	*&/D	AI"$dnn,< ##T^^45  -r   c                     U R                   U R                  -
  U R                  U R                  -  -   U R                  U R                  -  -  $ )N)r   r   r"   r   )r#   s    r   __len__TimeseriesGenerator.__len__   s@    NNT---$++0MMoo+- 	-r   c           	         U R                   (       aA  [        R                  R                  U R                  U R
                  S-   U R                  S9nO{U R                  U R                  U R                  -  U-  -   n[        R                  " U[        X0R                  U R                  -  -   U R
                  S-   5      U R                  5      n[        R                  " U Vs/ s H*  nU R                  X@R                  -
  X@R                  2   PM,     sn5      n[        R                  " U Vs/ s H  o@R                  U   PM     sn5      nU R                  (       a  US S 2S S S2S4   U4$ XV4$ s  snf s  snf )Nr   )size.)r    nprandomrandintr   r   r"   r   arangeminarrayr   r   r   r   r!   )r#   indexrowsirowsamplesr   s          r   __getitem__TimeseriesGenerator.__getitem__   s=   <<99$$  $..1"44?? % D   4??T[[#@5#HHA99A$++55t~~7IJD ((  C 		#+c4F4FFG
 ((>#LL->?<<1ddC<('11
 ?s   "1E0.E5c                    U R                   n[        U R                   5      R                  [        R                  :X  a  U R                   R                  5       n [        R                  " U5      nU R                  n[        U R                  5      R                  [        R                  :X  a  U R                  R                  5       n [        R                  " U5      nUUU R                  U R                  U R                  U R                  U R                  U R                  U R                   U R"                  S.
$ ! [         a  n[        SU5      UeSnAff = f! [         a  n[        SU5      UeSnAff = f)zReturns the TimeseriesGenerator configuration as Python dictionary.

Returns:
    A Python dictionary with the TimeseriesGenerator configuration.
zData not JSON Serializable:NzTargets not JSON Serializable:)
r   r   r   r   r   r   r   r    r!   r"   )r   type
__module__r,   __name__tolistjsondumps	TypeErrorr   r   r   r   r   r   r    r!   r"   )r#   r   	json_dataer   json_targetss         r   
get_configTimeseriesGenerator.get_config   s.    yy		?%%499##%D	H

4(I ,,((BKK7ll))+G	N::g.L
 #kk!//kk++||||//
 	
  	H94@aG	H  	N<gFAM	Ns0   D: E :
EEE
E5#E00E5c                     U R                  5       nU R                  R                  US.n[        R                  " U40 UD6$ )zReturns a JSON string containing the generator's configuration.

Args:
    **kwargs: Additional keyword arguments to be passed
        to `json.dumps()`.

Returns:
    A JSON string containing the tokenizer configuration.
)
class_nameconfig)rD   	__class__r<   r>   r?   )r#   kwargsrH   timeseries_generator_configs       r   to_jsonTimeseriesGenerator.to_json   s?     "..11'
# zz5@@@r   )
r"   r   r   r   r!   r   r    r   r   r   )r   r   r   NFF   )r<   r;   __qualname____firstlineno____doc__r$   r'   r7   rD   rL   __static_attributes__ r   r   r   r   7   s<    @N 'R-
 2!
FAr   r   z0keras.preprocessing.sequence.make_sampling_tablec                     Sn[         R                  " U 5      nSUS'   U[         R                  " U5      U-   -  S-   SSU-  -  -
  nX-  n[         R                  " SU[         R                  " U5      -  5      $ )a  Generates a word rank-based probabilistic sampling table.

Used for generating the `sampling_table` argument for `skipgrams`.
`sampling_table[i]` is the probability of sampling
the word i-th most common word in a dataset
(more common words should be sampled less frequently, for balance).

The sampling probabilities are generated according
to the sampling distribution used in word2vec:

```
p(word) = (min(1, sqrt(word_frequency / sampling_factor) /
    (word_frequency / sampling_factor)))
```

We assume that the word frequencies follow Zipf's law (s=1) to derive
a numerical approximation of frequency(rank):

`frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
where `gamma` is the Euler-Mascheroni constant.

Args:
    size: Int, number of possible words to sample.
    sampling_factor: The sampling factor in the word2vec formula.

Returns:
    A 1D Numpy array of length `size` where the ith entry
    is the probability that a word of rank i should be sampled.
gX9v?r   r   g      ?      ?g      (@)r,   r/   logminimumsqrt)r*   sampling_factorgammarankinv_fqfs         r   make_sampling_tabler^      sn    > E99T?DDGRVVD\E)*S03$+3FFF A::c1rwwqz>**r   z&keras.preprocessing.sequence.skipgramsc           
         / n/ n	[        U 5       H  u  pU(       d  M  Ub  Xk   [        R                  " 5       :  a  M.  [        SX-
  5      n[        [	        U 5      X-   S-   5      n[        X5       HV  nX:w  d  M
  X   nU(       d  M  UR                  X/5        U(       a  U	R                  SS/5        ME  U	R                  S5        MX     M     US:  a  [        [	        U	5      U-  5      nU Vs/ s H  nUS   PM
     nn[        R                  " U5        U[        U5       V
s/ s H.  n
UU
[	        U5      -     [        R                  " SUS-
  5      /PM0     sn
-  nU(       a  U	SS//U-  -  n	O	U	S/U-  -  n	U(       ar  Uc  [        R                  " SS5      n[        R                  " U5        [        R                  " U5        [        R                  " U5        [        R                  " U	5        X4$ s  snf s  sn
f )a  Generates skipgram word pairs.

This function transforms a sequence of word indexes (list of integers)
into tuples of words of the form:

- (word, word in the same window), with label 1 (positive samples).
- (word, random word from the vocabulary), with label 0 (negative samples).

Read more about Skipgram in this gnomic paper by Mikolov et al.:
[Efficient Estimation of Word Representations in
Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)

Args:
    sequence: A word sequence (sentence), encoded as a list
        of word indices (integers). If using a `sampling_table`,
        word indices are expected to match the rank
        of the words in a reference dataset (e.g. 10 would encode
        the 10-th most frequently occurring token).
        Note that index 0 is expected to be a non-word and will be skipped.
    vocabulary_size: Int, maximum possible word index + 1
    window_size: Int, size of sampling windows (technically half-window).
        The window of a word `w_i` will be
        `[i - window_size, i + window_size+1]`.
    negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
        1 for same number as positive samples.
    shuffle: Whether to shuffle the word couples before returning them.
    categorical: bool. if False, labels will be
        integers (eg. `[0, 1, 1 .. ]`),
        if `True`, labels will be categorical, e.g.
        `[[1,0],[0,1],[0,1] .. ]`.
    sampling_table: 1D array of size `vocabulary_size` where the entry i
        encodes the probability to sample a word of rank i.
    seed: Random seed.

Returns:
    couples, labels: where `couples` are int pairs and
        `labels` are either 0 or 1.

Note:
    By convention, index 0 in the vocabulary is
    a non-word and will be skipped.
r   r   g    cA)	enumerater-   maxr0   r   ranger   intr    r.   seed)sequencevocabulary_sizewindow_sizenegative_samplesr    categoricalsampling_tablerd   coupleslabelsr4   wiwindow_start
window_endjwjnum_negative_samplescwordss                      r   	skipgramsru     s   j GF8$%!FMMO31ao.X!(;<
|0Av[x(MM1a&)MM!$ 1 %( !"3v;1A#AB&'w!1w'u/0
0 1s5z>"FNN1o6I$JK0
 	
 1vh!555Fqc000F<>>!T*DDwDv?) (
s   0G7%5G<)gh㈵>)   rU   TFNN)rQ   r>   r-   numpyr,   tf_keras.src.utilsr    tensorflow.python.util.tf_exportr   r   Sequencer   r^   ru   rS   r   r   <module>r{      s       ) :& @A|A*-- |A B|A~ @A$+ B$+N 67 	` 8`r   