Skip to content

Commit

Permalink
Merge pull request #48 from philipperemy/sequential
Browse files Browse the repository at this point in the history
add sequential examples + keras layer
  • Loading branch information
philipperemy authored Sep 3, 2020
2 parents 0309dbf + 75aac86 commit 5fa5345
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 70 deletions.
41 changes: 36 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,37 @@
[![license](https://img.shields.io/badge/License-Apache_2.0-brightgreen.svg)](https://github.com/philipperemy/keras-attention-mechanism/blob/master/LICENSE) [![dep1](https://img.shields.io/badge/Tensorflow-2.0+-brightgreen.svg)](https://www.tensorflow.org/) [![dep2](https://img.shields.io/badge/Keras-2.0+-brightgreen.svg)](https://keras.io/)
![Simple Keras Attention CI](https://github.com/philipperemy/keras-attention-mechanism/workflows/Simple%20Keras%20Attention%20CI/badge.svg)

```
pip install attention
```

Many-to-one attention mechanism for Keras.

<p align="center">
<img src="examples/equations.png">
<img src="examples/equations.png" width="600">
</p>


Installation via pip

```bash
pip install attention
```

Import in the source code

```python
from attention import Attention

# [...]

m = Sequential([
LSTM(128, input_shape=(seq_length, 1), return_sequences=True),
Attention(name='attention_weight'), # <--------- here.
Dense(1, activation='linear')
])
```

## Examples

Install the requirements before running the examples: `pip install -r requirements.txt`.

### IMDB Dataset

In this experiment, we demonstrate that using attention yields a higher accuracy on the IMDB dataset. We consider two
Expand Down Expand Up @@ -46,6 +65,18 @@ task and the attention map converges to the ground truth.
<img src="examples/attention.gif" width="320">
</p>

### Finding max of a sequence

We consider many 1D sequences of the same length. The task is to find the maximum of each sequence.

We feed the full sequence of RNN outputs to the attention layer. We expect the attention layer to focus on the maximum of each sequence.

After a few epochs, the attention layer converges perfectly to what we expected.

<p align="center">
<img src="examples/readme/example.png" width="320">
</p>

## References

- https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf
Expand Down
4 changes: 3 additions & 1 deletion attention/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from attention.attention import attention_3d_block # noqa
from attention.attention import Attention # noqa

# Package version string; setup.py imports it and passes it as `version`.
VERSION = '3.0'
52 changes: 29 additions & 23 deletions attention/attention.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,32 @@
from tensorflow.keras.layers import Dense, Lambda, dot, Activation, concatenate
from tensorflow.keras.layers import Layer


def attention_3d_block(hidden_states):
    """
    Many-to-one attention block for Keras (Luong-style multiplicative score).

    The last time step is used as the query against every time step of the
    input; the resulting context vector is concatenated with that last step
    and squashed through a tanh Dense layer.

    @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim).
    @return: 2D tensor with shape (batch_size, 128)
    @author: felixhao28.
    """
    n_hidden = int(hidden_states.shape[2])

    # Trainable matrix W of the multiplicative score, applied per time step:
    # (batch_size, time_steps, n_hidden) dot (n_hidden, n_hidden)
    #   => (batch_size, time_steps, n_hidden)
    score_projection = Dense(n_hidden, use_bias=False, name='attention_score_vec')
    projected = score_projection(hidden_states)

    # Query = hidden state of the last time step: (batch_size, n_hidden).
    take_last_step = Lambda(lambda x: x[:, -1, :], output_shape=(n_hidden,), name='last_hidden_state')
    last_state = take_last_step(hidden_states)

    # (batch_size, time_steps, n_hidden) dot (batch_size, n_hidden)
    #   => (batch_size, time_steps)
    raw_scores = dot([projected, last_state], [2, 1], name='attention_score')
    softmax = Activation('softmax', name='attention_weight')
    weights = softmax(raw_scores)

    # Weighted sum over time:
    # (batch_size, time_steps, n_hidden) dot (batch_size, time_steps)
    #   => (batch_size, n_hidden)
    context = dot([hidden_states, weights], [1, 1], name='context_vector')

    # Combine context and query, then project to the fixed 128-wide output.
    merged = concatenate([context, last_state], name='attention_output')
    output_projection = Dense(128, use_bias=False, activation='tanh', name='attention_vector')
    return output_projection(merged)
class Attention(Layer):
    """
    Many-to-one attention mechanism for Keras (Luong-style multiplicative score).

    NOTE(review): this class overrides __call__ rather than the usual Keras
    call() hook, and creates its named sub-layers on every invocation. The
    examples rely on the inner softmax layer being named 'attention_weight'
    to fetch the attention map, so that structure is kept as is; confirm
    before reusing one instance on several inputs.

    @author: felixhao28.
    """

    def __init__(self, units=128, **kwargs):
        """
        @param units: width of the returned attention vector. Defaults to 128,
            the value that used to be hard-coded.
        @param kwargs: forwarded unchanged to the Keras Layer constructor (e.g. name).
        """
        super().__init__(**kwargs)
        self.units = units

    def __call__(self, hidden_states):
        """
        Apply the attention mechanism to a sequence of hidden states.
        @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim).
        @return: 2D tensor with shape (batch_size, units)
        """
        hidden_size = int(hidden_states.shape[2])
        # Inside dense layer
        #              hidden_states            dot               W            =>           score_first_part
        # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
        # W is the trainable weight matrix of attention Luong's multiplicative style score
        score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
        #            score_first_part           dot        last_hidden_state     => attention_weights
        # (batch_size, time_steps, hidden_size) dot   (batch_size, hidden_size)  => (batch_size, time_steps)
        h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
        score = dot([score_first_part, h_t], [2, 1], name='attention_score')
        attention_weights = Activation('softmax', name='attention_weight')(score)
        # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
        context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
        pre_activation = concatenate([context_vector, h_t], name='attention_output')
        # Output width is configurable through the constructor instead of fixed at 128.
        attention_vector = Dense(self.units, use_bias=False, activation='tanh', name='attention_vector')(pre_activation)
        return attention_vector
24 changes: 10 additions & 14 deletions examples/example-attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@
import numpy
import numpy as np
from keract import get_activations
from tensorflow.keras import Input
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense, Dropout, LSTM

from attention import attention_3d_block
from attention import Attention


def task_add_two_numbers_after_delimiter(n: int, seq_length: int, delimiter: float = 0.0,
Expand Down Expand Up @@ -59,14 +56,13 @@ def main():
x_test_mask[:, test_index_1:test_index_1 + 1] = 1
x_test_mask[:, test_index_2:test_index_2 + 1] = 1

# model
i = Input(shape=(seq_length, 1))
x = LSTM(100, return_sequences=True)(i)
x = attention_3d_block(x)
x = Dropout(0.2)(x)
x = Dense(1, activation='linear')(x)
model = Sequential([
LSTM(100, input_shape=(seq_length, 1), return_sequences=True),
Attention(name='attention_weight'),
Dropout(0.2),
Dense(1, activation='linear')
])

model = Model(inputs=[i], outputs=[x])
model.compile(loss='mse', optimizer='adam')
print(model.summary())

Expand All @@ -79,7 +75,7 @@ def main():
class VisualiseAttentionMap(Callback):

def on_epoch_end(self, epoch, logs=None):
attention_map = get_activations(model, x_test, layer_name='attention_weight')['attention_weight']
attention_map = get_activations(model, x_test, layer_names='attention_weight')['attention_weight']

# top is attention map.
# bottom is ground truth.
Expand Down
64 changes: 64 additions & 0 deletions examples/find_max.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import matplotlib.pyplot as plt
import numpy as np
from keract import get_activations
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense, LSTM

from attention import Attention


class VisualizeAttentionMap(Callback):
    """
    Keras callback that plots the attention map for a fixed batch of inputs
    at the beginning of every epoch, next to the argmax of the attention map
    and the argmax of the inputs themselves.
    """

    def __init__(self, model, x):
        """
        @param model: model whose 'attention_weight' activations are plotted.
        @param x: input batch to visualise; only channel 0 of the last axis is
            used as ground truth.
        """
        super().__init__()
        self.model = model
        self.x = x

    def on_epoch_begin(self, epoch, logs=None):
        """
        Draw three stacked heat maps (attention, its argmax, input argmax).
        @param epoch: epoch index, shown in the figure title.
        @param logs: unused.
        """
        layer = 'attention_weight'
        attention_map = get_activations(self.model, self.x, layer_names=layer)[layer]
        inputs = self.x[..., 0]

        fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(5, 6))
        panels = [
            ('attention layer', attention_map),
            ('attention layer - argmax()', create_argmax_mask(attention_map)),
            ('ground truth - argmax()', create_argmax_mask(inputs)),
        ]
        for ax, (title, data) in zip(axes.flat, panels):
            im = ax.imshow(data, interpolation='none', cmap='jet')
            ax.set_ylabel(title + '\n#sample axis')
            ax.set_xlabel('sequence axis')
            ax.xaxis.set_ticks([])
            ax.yaxis.set_ticks([])

        # The colorbar is driven by the last image drawn in the loop above.
        colorbar_axis = fig.add_axes([0.75, 0.15, 0.05, 0.7])
        fig.colorbar(im, cax=colorbar_axis)
        fig.suptitle(f'Epoch {epoch} - training')
        plt.show()


def create_argmax_mask(x):
    """
    Build a one-hot mask marking the position of the maximum of each row.

    @param x: 2D numpy array with shape (num_rows, num_cols).
    @return: array with the same shape and dtype as x, holding 1 at the
        argmax of each row (first occurrence on ties) and 0 everywhere else.
    """
    mask = np.zeros_like(x)
    # Pair every row index with that row's argmax and set them all in one
    # indexed assignment instead of looping over rows in Python.
    mask[np.arange(x.shape[0]), x.argmax(axis=1)] = 1
    return mask


def main():
    """
    Train an LSTM + Attention model to predict the maximum of random
    sequences, plotting the attention map of the first 12 samples at the
    start of each epoch.
    """
    seq_length = 10
    num_samples = 100000

    # https://stats.stackexchange.com/questions/485784/which-distribution-has-its-maximum-uniformly-distributed
    # Sampling from beta(1/N, 1) makes max(X_1, ..., X_N) ~ U(0, 1), which
    # minimizes the amount of prior knowledge. If all the maxima were
    # concentrated around 1, the task would be easy for the model.
    data_shape = (num_samples, seq_length, 1)
    x_data = np.random.beta(a=1 / seq_length, b=1, size=data_shape)
    # Target: maximum over the sequence axis, shape (num_samples, 1).
    y_data = np.max(x_data, axis=1)

    layers = [
        LSTM(128, input_shape=(seq_length, 1), return_sequences=True),
        Attention(name='attention_weight'),
        Dense(1, activation='linear'),
    ]
    model = Sequential(layers)
    model.compile(loss='mae')

    # visualize the attention on the first samples.
    attention_plotter = VisualizeAttentionMap(model, x_data[0:12])
    model.fit(
        x_data,
        y_data,
        epochs=100,
        validation_split=0.2,
        callbacks=[attention_plotter],
    )


# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    main()
41 changes: 18 additions & 23 deletions examples/imdb.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,37 @@
import numpy
import numpy as np
from tensorflow.keras import Input
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.preprocessing import sequence

from attention import attention_3d_block
from attention import Attention


def train_and_evaluate_model_on_imdb(add_attention=True):
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
x_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)
# create the model
embedding_vector_length = 32
i = Input(shape=(max_review_length,))
x = Embedding(top_words, embedding_vector_length, input_length=max_review_length)(i)
x = Dropout(0.5)(x)
if add_attention:
x = LSTM(100, return_sequences=True)(x)
x = attention_3d_block(x)
else:
x = LSTM(100, return_sequences=False)(x)
x = Dense(350, activation='relu')(x) # same number of parameters so fair comparison.
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[i], outputs=[x])
model = Sequential([
Embedding(top_words, embedding_vector_length, input_length=max_review_length),
Dropout(0.5),
# attention vs no attention. same number of parameters so fair comparison.
*([LSTM(100, return_sequences=True), Attention()] if add_attention
else [LSTM(100), Dense(350, activation='relu')]),
Dropout(0.5),
Dense(1, activation='sigmoid')
]
)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Expand All @@ -52,7 +47,7 @@ def on_epoch_end(self, epoch, logs=None):
self.val_losses.append(logs['val_loss'])

rbta = RecordBestTestAccuracy()
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[rbta])
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=64, callbacks=[rbta])

print(f"Max Test Accuracy: {100 * np.max(rbta.val_accuracies):.2f} %")
print(f"Mean Test Accuracy: {100 * np.mean(rbta.val_accuracies):.2f} %")
Expand Down
Binary file added examples/readme/example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 5 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
from setuptools import setup

from attention import VERSION

setup(
name='attention',
version='2.2',
description='Keras Attention Many to One',
version=VERSION,
description='Keras Simple Attention',
author='Philippe Remy',
license='Apache 2.0',
long_description_content_type='text/markdown',
long_description=open('README.md').read(),
packages=['attention'],
# manually install tensorflow or tensorflow-gpu
install_requires=[
'numpy>=1.18.1',
'keras>=2.3.1',
'gast>=0.2.2'
'tensorflow>=2.1'
]
)

0 comments on commit 5fa5345

Please sign in to comment.