In this project, I will use the IMDB movie review dataset. It contains 50,000 movie reviews from IMDB, labeled by sentiment (positive/negative). The dataset can be loaded and split into training and test sets as follows.
from keras.datasets import imdb
# Load the IMDB reviews with the loader's default settings (no vocabulary cap).
(X_train, y_train), (X_test, y_test) = imdb.load_data()

# Report the number of reviews in each split.
for split, label in ((X_train, 'train sequences'), (X_test, 'test sequences')):
    print(len(split), label)
Let us have a look at the first sample of the training set.
As is clear, the text of each review is integer-encoded: each integer represents a specific word in the dictionary.
We can convert the integers back to words as follows.
# Offset applied to every raw word index. It must match the `index_from`
# argument of imdb.load_data (default 3), which reserves the low indices
# for the special tokens defined below.
INDEX_FROM = 3

# Raw mapping word -> rank, shifted so it lines up with the encoded reviews.
word_index = imdb.get_word_index()
word_index = {key: (value + INDEX_FROM) for key, value in word_index.items()}
word_index["<PAD>"] = 0  # the padding token
word_index["<START>"] = 1  # the starting token
word_index["<UNK>"] = 2  # the unknown token

# Inverse mapping (index -> word) used to turn encoded reviews back into text.
reverse_word_index = {value: key for key, value in word_index.items()}
def decode_review(text):
    """Convert an integer-encoded review back into a space-separated string.

    Args:
        text: iterable of integer word indices.

    Returns:
        The words looked up in ``reverse_word_index`` and joined with single
        spaces; any index missing from that mapping is rendered as ``'?'``.
    """
    return ' '.join(reverse_word_index.get(i, '?') for i in text)
Next, I will only consider the 5,000 most common words. I will also hold out 20% of the training set (5,000 reviews) for validation.
# Reload the data, this time capping the vocabulary size.
vocab_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

# Take the last 5000 training reviews as the validation split first,
# then trim them off the training arrays.
X_val, y_val = X_train[-5000:], y_train[-5000:]
X_train, y_train = X_train[:-5000], y_train[:-5000]

# Report the size of each split.
for split, label in (
    (X_train, 'train sequences'),
    (X_val, 'val sequences'),
    (X_test, 'test sequences'),
):
    print(len(split), label)
Let us inspect what the first review looks like when we only consider the 5,000 most frequent words.
Movie reviews can have different lengths. We will use the pad_sequences function to standardize the lengths of the reviews.
from keras.preprocessing.sequence import pad_sequences
maximum_sequence_length = 500  # maximum length of all review sequences

# Apply the same padding settings to every split: pad at the end with the
# <PAD> token, up to maximum_sequence_length.
X_train, X_val, X_test = [
    pad_sequences(
        split,
        value=word_index["<PAD>"],
        padding='post',
        maxlen=maximum_sequence_length,
    )
    for split in (X_train, X_val, X_test)
]

# Each array is now 2-D: (n_samples, n_timesteps).
for label, split in (
    ('X_train shape:', X_train),
    ('X_val shape:', X_val),
    ('X_test shape:', X_test),
):
    print(label, split.shape)
Let us check the first padded review.
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import ParameterGrid
from keras.callbacks import EarlyStopping
embedding_dim = 16


def create_model(filters=64, kernel_size=3, strides=1, units=256,
                 optimizer='adam', rate=0.25, kernel_initializer='glorot_uniform'):
    """Build and compile a 1-D convolutional binary sentiment classifier.

    Args:
        filters: number of convolution filters.
        kernel_size: width of the convolution window.
        strides: stride of the convolution.
        units: number of units in the hidden dense layer.
        optimizer: optimizer passed to ``model.compile``.
        rate: dropout rate applied after the hidden dense layer.
        kernel_initializer: initializer for the hidden dense layer's kernel.

    Returns:
        A compiled Keras ``Sequential`` model with a single sigmoid output.
    """
    model = Sequential()
    # Embedding layer
    model.add(Embedding(vocab_size, embedding_dim, input_length=maximum_sequence_length))
    # Convolutional Layer(s)
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, strides=strides,
                     padding='same', activation='relu'))
    # Collapse the time axis so the dense layers see one vector per review;
    # without this the output stays 3-D and cannot match the 1-D labels.
    model.add(GlobalMaxPooling1D())
    # Dense layer(s)
    model.add(Dense(units=units, activation='relu', kernel_initializer=kernel_initializer))
    # NOTE(review): the original Dropout placement was lost; this position is
    # a reconstruction so that `rate` is actually used -- confirm.
    model.add(Dropout(rate))
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    # Compile the model. The metric is named 'acc' so the training history
    # exposes the 'val_acc' key that the grid search monitors and reads.
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['acc'])
    return model


# Build the model
model = KerasClassifier(build_fn=create_model)
Now, it is time to tune the hyperparameters to improve accuracy on the validation set.
# Set the hyperparameters
filters = [128]  # [64, 128, 256]
kernel_size = [5]  # [3, 5, 7]
strides = [1]  # [1, 2, 5]
Dense_units = [128, 512]
kernel_initializer = ['TruncatedNormal']  # ['zero', 'glorot_uniform', 'glorot_normal','TruncatedNormal']
rate_dropouts = [0.25]  # [0.1, 0.25, 0.5]
optimizers = ['adam']  # ['adam','rmsprop']
epochs = [5]
batches = [64]  # [32, 64, 128]
# ----------------------------------------------
# Exhaustive Grid Search
param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches,
                  filters=filters, kernel_size=kernel_size, strides=strides,
                  units=Dense_units, kernel_initializer=kernel_initializer, rate=rate_dropouts)
grid = ParameterGrid(param_grid)
param_sets = list(grid)
param_scores = []
for params in param_sets:
    print('params:', params)
    # Apply this combination to the wrapped model before training it.
    model.set_params(**params)
    # patience=0: stop as soon as validation accuracy fails to improve.
    earlystopper = EarlyStopping(monitor='val_acc', patience=0, verbose=1)
    history = model.fit(X_train, y_train,
                        shuffle=True,
                        validation_data=(X_val, y_val),
                        callbacks=[earlystopper])
    param_score = history.history['val_acc']
    # Record the validation accuracy of the last completed epoch, i.e. the
    # state the model was left in, so scores stay aligned with param_sets.
    param_scores.append(param_score[-1])
print('param_scores:', param_scores)
# Choose best parameters
p = np.argmax(np.array(param_scores))
print("best score:", param_scores[p])
best_params = param_sets[p]
print("best parameter set", best_params)
Here, I train the model with the best hyperparameters found, using the combined training and validation sets.
# Configure the wrapped model with the winning hyperparameters, then retrain
# on the training and validation splits stacked together.
model.set_params(**best_params)
model.fit(np.vstack((X_train, X_val)), np.hstack((y_train, y_val)))
Finally, I evaluate the performance of the trained model on the unseen test set.
# accuracy_score was used without ever being imported; bring it into scope here.
from sklearn.metrics import accuracy_score

# Compare the predicted labels with the true test labels, as a percentage.
print("Test accuracy = %f%%" % (accuracy_score(y_test, model.predict(X_test))*100))