
"""Attention layer that can be used in sequence DNN/CNN models.

This file follows the terminology of https://arxiv.org/abs/1706.03762 Figure 2.
Attention is formed by three tensors: Query, Key and Value.
    N)BaseDenseAttention)keras_exportzkeras.layers.Attentionc                   N   ^  \ rS rSrSrSU 4S jjrU 4S jrS rU 4S jrSr	U =r
$ )		Attention   a  Dot-product attention layer, a.k.a. Luong-style attention.

Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor
of shape `[batch_size, Tv, dim]` and `key` tensor of shape
`[batch_size, Tv, dim]`. The calculation follows the steps:

1. Calculate scores with shape `[batch_size, Tq, Tv]` as a `query`-`key` dot
    product: `scores = tf.matmul(query, key, transpose_b=True)`.
2. Use scores to calculate a distribution with shape
    `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`.
3. Use `distribution` to create a linear combination of `value` with
    shape `[batch_size, Tq, dim]`:
    `return tf.matmul(distribution, value)`.
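
As raw TensorFlow ops, these three steps amount to the following sketch
(masking, dropout and the optional learned scale are omitted; `query`,
`key` and `value` stand for the input tensors described above):

```python
# query: [batch_size, Tq, dim]; key, value: [batch_size, Tv, dim].
scores = tf.matmul(query, key, transpose_b=True)  # [batch_size, Tq, Tv]
distribution = tf.nn.softmax(scores)              # [batch_size, Tq, Tv]
result = tf.matmul(distribution, value)           # [batch_size, Tq, dim]
```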

Args:
    use_scale: If `True`, will create a scalar variable to scale the
        attention scores.
    dropout: Float between 0 and 1. Fraction of the units to drop for the
        attention scores. Defaults to 0.0.
    score_mode: Function to use to compute attention scores, one of
        `{"dot", "concat"}`. `"dot"` refers to the dot product between the
        query and key vectors. `"concat"` refers to the hyperbolic tangent
        of the concatenation of the query and key vectors.
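
For intuition, `"concat"` scoring mirrors the `_calculate_scores`
implementation below; a rough sketch, where `scale` and
`concat_score_weight` are the scalar variables created in `build`:

```python
# q: [batch_size, Tq, 1, dim], k: [batch_size, 1, Tv, dim] (expand_dims).
scores = concat_score_weight * tf.reduce_sum(
    tf.tanh(scale * (q + k)), axis=-1
)  # [batch_size, Tq, Tv]
```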

Call arguments:
    inputs: List of the following tensors:
        * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
        * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
        * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If
            not given, will use `value` for both `key` and `value`, which is
            the most common case.
    mask: List of the following tensors:
        * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
            If given, the output will be zero at the positions where
            `mask==False`.
        * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
            If given, will apply the mask such that values at positions
            where `mask==False` do not contribute to the result.
    return_attention_scores: bool, if `True`, returns the attention scores
        (after masking and softmax) as an additional output argument.
    training: Python boolean indicating whether the layer should behave in
        training mode (adding dropout) or in inference mode (no dropout).
    use_causal_mask: Boolean. Set to `True` for decoder self-attention. Adds
        a mask such that position `i` cannot attend to positions `j > i`.
        This prevents the flow of information from the future towards the
        past.
        Defaults to `False`.
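
Putting the call arguments together, a causal self-attention call that
also returns the scores might look like this sketch (`x` is assumed to
be a `[batch_size, T, dim]` tensor):

```python
layer = tf.keras.layers.Attention(use_scale=True)
outputs, scores = layer(
    [x, x],  # query and value; key defaults to value
    return_attention_scores=True,
    use_causal_mask=True,
)
# outputs: [batch_size, T, dim], scores: [batch_size, T, T]
```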

Output:

    Attention outputs of shape `[batch_size, Tq, dim]`.
    [Optional] Attention scores after masking and softmax with shape
        `[batch_size, Tq, Tv]`.

The meaning of `query`, `value` and `key` depends on the application. In the
case of text similarity, for example, `query` is the sequence embeddings of
the first piece of text and `value` is the sequence embeddings of the second
piece of text. `key` is usually the same tensor as `value`.

Here is a code example for using `Attention` in a CNN+Attention network:

```python
# Variable-length int sequences.
query_input = tf.keras.Input(shape=(None,), dtype='int32')
value_input = tf.keras.Input(shape=(None,), dtype='int32')

# Embedding lookup.
token_embedding = tf.keras.layers.Embedding(input_dim=1000, output_dim=64)
# Query embeddings of shape [batch_size, Tq, dimension].
query_embeddings = token_embedding(query_input)
# Value embeddings of shape [batch_size, Tv, dimension].
value_embeddings = token_embedding(value_input)

# CNN layer.
cnn_layer = tf.keras.layers.Conv1D(
    filters=100,
    kernel_size=4,
    # Use 'same' padding so outputs have the same shape as inputs.
    padding='same')
# Query encoding of shape [batch_size, Tq, filters].
query_seq_encoding = cnn_layer(query_embeddings)
# Value encoding of shape [batch_size, Tv, filters].
value_seq_encoding = cnn_layer(value_embeddings)

# Query-value attention of shape [batch_size, Tq, filters].
query_value_attention_seq = tf.keras.layers.Attention()(
    [query_seq_encoding, value_seq_encoding])

# Reduce over the sequence axis to produce encodings of shape
# [batch_size, filters].
query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
    query_seq_encoding)
query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
    query_value_attention_seq)

# Concatenate query and document encodings to produce a DNN input layer.
input_layer = tf.keras.layers.Concatenate()(
    [query_encoding, query_value_attention])

# Add DNN layers, and create Model.
# ...
```

    """

    def __init__(self, use_scale=False, score_mode="dot", **kwargs):
        super().__init__(**kwargs)
        self.use_scale = use_scale
        self.score_mode = score_mode
        if self.score_mode not in ["dot", "concat"]:
            raise ValueError(
                f"Received: score_mode={score_mode}. Acceptable values "
                'are: ["dot", "concat"]'
            )

    def build(self, input_shape):
        """Creates variable when `use_scale` is True or `score_mode` is
        `concat`."""
        if self.use_scale:
            self.scale = self.add_weight(
                name="scale",
                shape=(),
                initializer="ones",
                dtype=self.dtype,
                trainable=True,
            )
        else:
            self.scale = None
        if self.score_mode == "concat":
            self.concat_score_weight = self.add_weight(
                name="concat_score_weight",
                shape=(),
                initializer="ones",
                dtype=self.dtype,
                trainable=True,
            )
        else:
            self.concat_score_weight = None
        super().build(input_shape)

    def _calculate_scores(self, query, key):
        """Calculates attention scores as a query-key dot product.

Args:
    query: Query tensor of shape `[batch_size, Tq, dim]`.
    key: Key tensor of shape `[batch_size, Tv, dim]`.
Returns:
    Tensor of shape `[batch_size, Tq, Tv]`.
        """
        if self.score_mode == "dot":
            scores = tf.matmul(query, key, transpose_b=True)
            if self.scale is not None:
                scores *= self.scale
        elif self.score_mode == "concat":
            # Reshape tensors to enable broadcasting.
            # Reshape into [batch_size, Tq, 1, dim].
            q_reshaped = tf.expand_dims(query, axis=-2)
            # Reshape into [batch_size, 1, Tv, dim].
            k_reshaped = tf.expand_dims(key, axis=-3)
            if self.scale is not None:
                scores = self.concat_score_weight * tf.reduce_sum(
                    tf.tanh(self.scale * (q_reshaped + k_reshaped)), axis=-1
                )
            else:
                scores = self.concat_score_weight * tf.reduce_sum(
                    tf.tanh(q_reshaped + k_reshaped), axis=-1
                )
        return scores

    def get_config(self):
        config = {"use_scale": self.use_scale, "score_mode": self.score_mode}
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))