
    6bi=                     ^    S r SSKJs  Jr  SSKJr  SSKJr  \" S5       " S S\5      5       r	g)zAdditive attention layer that can be used in sequence DNN/CNN models.

This file follows the terminology of https://arxiv.org/abs/1706.03762 Figure 2.
Attention is formed by three tensors: Query, Key and Value.
    N)BaseDenseAttention)keras_exportzkeras.layers.AdditiveAttentionc                   N   ^  \ rS rSrSrSU 4S jjrU 4S jrS rU 4S jrSr	U =r
$ )	AdditiveAttention   a  Additive attention layer, a.k.a. Bahdanau-style attention.

Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor
of shape `[batch_size, Tv, dim]` and `key` tensor of shape
`[batch_size, Tv, dim]`. The calculation follows the steps:

1. Reshape `query` and `key` into shapes `[batch_size, Tq, 1, dim]`
    and `[batch_size, 1, Tv, dim]` respectively.
2. Calculate scores with shape `[batch_size, Tq, Tv]` as a non-linear
    sum: `scores = tf.reduce_sum(tf.tanh(query + key), axis=-1)`
3. Use scores to calculate a distribution with shape
    `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`.
4. Use `distribution` to create a linear combination of `value` with
    shape `[batch_size, Tq, dim]`:
   `return tf.matmul(distribution, value)`.

Args:
    use_scale: If `True`, will create a variable to scale the attention
        scores.
    dropout: Float between 0 and 1. Fraction of the units to drop for the
        attention scores. Defaults to `0.0`.

Call arguments:
    inputs: List of the following tensors:
        * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
        * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
        * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`.
            If not given, will use `value` for both `key` and `value`,
            which is the most common case.
    mask: List of the following tensors:
        * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
            If given, the output will be zero at the positions where
            `mask==False`.
        * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
            If given, will apply the mask such that values at positions
            where `mask==False` do not contribute to the result.
    training: Python boolean indicating whether the layer should behave in
        training mode (adding dropout) or in inference mode (no dropout).
    return_attention_scores: bool, it `True`, returns the attention scores
        (after masking and softmax) as an additional output argument.
    use_causal_mask: Boolean. Set to `True` for decoder self-attention. Adds
        a mask such that position `i` cannot attend to positions `j > i`.
        This prevents the flow of information from the future towards the
        past. Defaults to `False`.

Output:

    Attention outputs of shape `[batch_size, Tq, dim]`.
    [Optional] Attention scores after masking and softmax with shape
        `[batch_size, Tq, Tv]`.

The meaning of `query`, `value` and `key` depend on the application. In the
case of text similarity, for example, `query` is the sequence embeddings of
the first piece of text and `value` is the sequence embeddings of the second
piece of text. `key` is usually the same tensor as `value`.

Here is a code example for using `AdditiveAttention` in a CNN+Attention
network:

```python
# Variable-length int sequences.
query_input = tf.keras.Input(shape=(None,), dtype='int32')
value_input = tf.keras.Input(shape=(None,), dtype='int32')

# Embedding lookup.
token_embedding = tf.keras.layers.Embedding(max_tokens, dimension)
# Query embeddings of shape [batch_size, Tq, dimension].
query_embeddings = token_embedding(query_input)
# Value embeddings of shape [batch_size, Tv, dimension].
value_embeddings = token_embedding(value_input)

# CNN layer.
cnn_layer = tf.keras.layers.Conv1D(
    filters=100,
    kernel_size=4,
    # Use 'same' padding so outputs have the same shape as inputs.
    padding='same')
# Query encoding of shape [batch_size, Tq, filters].
query_seq_encoding = cnn_layer(query_embeddings)
# Value encoding of shape [batch_size, Tv, filters].
value_seq_encoding = cnn_layer(value_embeddings)

# Query-value attention of shape [batch_size, Tq, filters].
query_value_attention_seq = tf.keras.layers.AdditiveAttention()(
    [query_seq_encoding, value_seq_encoding])

# Reduce over the sequence axis to produce encodings of shape
# [batch_size, filters].
query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
    query_seq_encoding)
query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
    query_value_attention_seq)

# Concatenate query and document encodings to produce a DNN input layer.
input_layer = tf.keras.layers.Concatenate()(
    [query_encoding, query_value_attention])

# Add DNN layers, and create Model.
# ...
```
c                 2   > [         TU ]  " S0 UD6  Xl        g )N )super__init__	use_scale)selfr   kwargs	__class__s      j/home/james-whalen/.local/lib/python3.13/site-packages/tf_keras/src/layers/attention/additive_attention.pyr   AdditiveAttention.__init__   s    "6""    c                   > [         R                  " US   5      nUS   n[         R                  R                  U5      nU R                  (       a$  U R                  SU/SU R                  SS9U l        OS U l        [        TU ]%  U5        g )N   scaleglorot_uniformT)nameshapeinitializerdtype	trainable)
tfTensorShapecompatdimension_valuer   
add_weightr   r   r
   build)r   input_shapev_shapedimr   s       r   r"   AdditiveAttention.build   sx    ..Q0bkii'',>>e,jj ) DJ DJk"r   c                     [         R                  " USS9n[         R                  " USS9nU R                  (       a  U R                  nOSn[         R                  " U[         R
                  " X4-   5      -  SS9$ )zCalculates attention scores as a nonlinear sum of query and key.

Args:
    query: Query tensor of shape `[batch_size, Tq, dim]`.
    key: Key tensor of shape `[batch_size, Tv, dim]`.
Returns:
    Tensor of shape `[batch_size, Tq, Tv]`.
)axisg      ?r   )r   expand_dimsr   r   
reduce_sumtanh)r   querykey
q_reshaped
k_reshapedr   s         r   _calculate_scores#AdditiveAttention._calculate_scores   s[     ^^E3
^^Cb1
>>JJEE}}URWWZ-D%EEBOOr   c                    > SU R                   0n[        TU ]	  5       n[        [	        UR                  5       5      [	        UR                  5       5      -   5      $ )Nr   )r   r
   
get_configdictlistitems)r   configbase_configr   s      r   r5   AdditiveAttention.get_config   sG    t~~.g(*D**,-V\\^0DDEEr   )r   r   )T)__name__
__module____qualname____firstlineno____doc__r   r"   r2   r5   __static_attributes____classcell__)r   s   @r   r   r      s&    dL## P(F Fr   r   )
r@   tensorflow.compat.v2r   v2r   2tf_keras.src.layers.attention.base_dense_attentionr    tensorflow.python.util.tf_exportr   r   r	   r   r   <module>rG      sB    " ! Q : ./RF* RF 0RFr   