In this project, I will use the IMDB movie review dataset. It contains 50,000 movie reviews from IMDB, labeled by sentiment (positive/negative). The dataset can be loaded and split into training and test sets as follows.
from keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data()
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
Let us have a look at the first sample of the training set.
print(X_train[0])
As you can see, the review text is integer-encoded: each integer represents a specific word in a dictionary.
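Each review is a plain Python list of word indices, and each label is simply 0 (negative) or 1 (positive). A quick sketch to confirm this, using only what load_data already returned:
print(type(X_train[0]), len(X_train[0]))  # a list of word indices, variable length
print('first label:', y_train[0])
print('label values:', set(y_train))      # {0, 1}: negative / positive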
We can convert the integers back to words as follows.
INDEX_FROM = 3
word_index = imdb.get_word_index()
word_index = {key:(value+INDEX_FROM) for key,value in word_index.items()}
word_index["<PAD>"] = 0 # the padding token
word_index["<START>"] = 1 # the starting token
word_index["<UNK>"] = 2 # the unknown token
reverse_word_index = {value:key for key, value in word_index.items()}
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
decode_review(X_train[0])
Next, I will only consider the top 5,000 most frequent words. I will also set aside the last 5,000 training reviews (20% of the training set) for validation.
vocab_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words= vocab_size)
X_train, X_val = X_train[:-5000], X_train[-5000:]
y_train, y_val = y_train[:-5000], y_train[-5000:]
print(len(X_train), 'train sequences')
print(len(X_val), 'val sequences')
print(len(X_test), 'test sequences')
Let us inspect what the first review looks like when we only keep the top 5,000 most frequent words; the less frequent words are replaced by the <UNK> token.
decode_review(X_train[0])
Movie reviews vary in length. We will use the pad_sequences function to standardize the length of the review sequences.
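Before padding, it helps to know how much the review lengths actually vary. The following is a small sketch that computes simple length statistics (it assumes X_train still holds variable-length integer lists at this point):
review_lengths = [len(review) for review in X_train]
print('shortest review:', min(review_lengths))
print('longest review:', max(review_lengths))
print('average review length:', sum(review_lengths) / len(review_lengths))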
from keras.preprocessing.sequence import pad_sequences
maximum_sequence_length = 500 # maximum length of all review sequences
X_train = pad_sequences(X_train, value= word_index["<PAD>"], padding= 'post', maxlen= maximum_sequence_length)
X_val = pad_sequences(X_val, value= word_index["<PAD>"], padding= 'post', maxlen= maximum_sequence_length)
X_test = pad_sequences(X_test, value= word_index["<PAD>"], padding= 'post', maxlen= maximum_sequence_length)
print('X_train shape:', X_train.shape) # (n_samples, n_timesteps)
print('X_val shape:', X_val.shape)
print('X_test shape:', X_test.shape)
Let us check the first padded review.
print(X_train[0])
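One detail worth keeping in mind: with the settings above, reviews longer than 500 words are cut off at the front, because pad_sequences defaults to truncating='pre'. A minimal check that every review now has exactly maximum_sequence_length timesteps:
print({len(review) for review in X_train})  # should print {500}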
Next, I will build a 1D convolutional neural network, with a word-embedding layer on top of the input, to classify the sentiment of the reviews.
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score
embedding_dim = 16
def create_model(filters=64, kernel_size=3, strides=1, units=256,
                 optimizer='adam', rate=0.25, kernel_initializer='glorot_uniform'):
    model = Sequential()
    # Embedding layer
    model.add(Embedding(vocab_size, embedding_dim, input_length=maximum_sequence_length))
    # Convolutional layer(s)
    model.add(Dropout(rate))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, strides=strides,
                     padding='same', activation='relu'))
    model.add(GlobalMaxPooling1D())
    # Dense layer(s)
    model.add(Dense(units=units, activation='relu', kernel_initializer=kernel_initializer))
    model.add(Dropout(rate))
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model
# Build the model
model = KerasClassifier(build_fn= create_model)
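Before searching over hyperparameters, it can be useful to sanity-check the architecture. The KerasClassifier wrapper only builds the network when fit is called, so this small sketch builds a standalone instance with the default arguments and prints its layer summary:
create_model().summary()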
Now, it is time to tune the hyperparameters to improve accuracy on the validation set.
# Set the hyperparameters
filters = [128] #[64, 128, 256]
kernel_size = [5] #[3, 5, 7]
strides= [1] # [1, 2, 5]
Dense_units = [128, 512]
kernel_initializer = ['TruncatedNormal'] #['zero', 'glorot_uniform', 'glorot_normal','TruncatedNormal']
rate_dropouts = [0.25] #[0.1, 0.25, 0.5]
optimizers = ['adam'] #['adam','rmsprop']
epochs = [5]
batches = [64] #[32, 64, 128]
# ----------------------------------------------
# Exhaustive Grid Search
param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches,
                  filters=filters, kernel_size=kernel_size, strides=strides,
                  units=Dense_units, kernel_initializer=kernel_initializer, rate=rate_dropouts)
grid = ParameterGrid(param_grid)
param_sets = list(grid)
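# ParameterGrid expands the dictionary above into the Cartesian product of all
# listed values; this optional check reports how many configurations will be trained.
print('number of parameter combinations:', len(param_sets))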
param_scores = []
for params in grid:
    print(params)
    model.set_params(**params)
    earlystopper = EarlyStopping(monitor='val_acc', patience=0, verbose=1)
    history = model.fit(X_train, y_train,
                        shuffle=True,
                        validation_data=(X_val, y_val),
                        callbacks=[earlystopper])
    param_score = history.history['val_acc']
    param_scores.append(param_score[-1])
    print('+-' * 50)
print('param_scores:', param_scores)
# Choose the best parameter set
p = np.argmax(np.array(param_scores))
best_params = param_sets[p]
print("best score:", param_scores[p])
print("best parameter set:", best_params)
Here, I retrain the model with the best hyperparameters found, using the combined training and validation sets.
model.set_params(**best_params)
model.fit(np.vstack((X_train, X_val)), np.hstack((y_train, y_val)))
Finally, I evaluate the performance of the trained model on the unseen test set.
print("Test accuracy = %f%%" % (accuracy_score(y_test, model.predict(X_test))*100))