Train your first Neural Network for Large-Scale Text Classification
Classification with neural networks is common practice. For a multi-label problem, one workaround is to combine two, three, or more labels into a single composite label (often called flattening) and train the network on that. In 2014, however, the paper Large-scale Multi-label Text Classification — Revisiting Neural Networks detailed its findings on using a simple neural network to obtain better multi-label classification results than the traditional Back-Propagation for Multi-Label Learning (BP-MLL) model. Find the paper here.
In this tutorial, we will run a similar experiment to implement multi-label classification. We will use the arXiv Paper Abstracts dataset to train a simple dense neural network on Kaggle using Keras. Let's get into the tutorial.
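Before jumping into the steps, note that the snippets below assume the usual imports and a dataframe named arxiv_data_filtered. Here is a minimal setup sketch: the CSV path is a placeholder for wherever the Kaggle dataset lives, and the rare-label filtering mirrors the original tutorial so that the stratified split has at least two samples per label combination.
from ast import literal_eval

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

# Placeholder path: point it at the arXiv Paper Abstracts CSV from Kaggle.
arxiv_data = pd.read_csv("arxiv_data.csv")

# Drop duplicate abstracts, then drop label combinations that occur only
# once (a single occurrence cannot be stratified across splits).
arxiv_data = arxiv_data[~arxiv_data["titles"].duplicated()]
arxiv_data_filtered = arxiv_data.groupby("terms").filter(lambda x: len(x) > 1)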
Step 1: Preprocessing
- Convert the string labels to lists of strings
- Use stratified splits because of class imbalance
- Multi-label binarization
# Convert the string labels to lists of strings.
arxiv_data_filtered["terms"] = arxiv_data_filtered["terms"].apply(
    lambda x: literal_eval(x)
)
# Use stratified splits because of class imbalance.
test_split = 0.1

# Initial train and test split.
train_df, test_df = train_test_split(
    arxiv_data_filtered,
    test_size=test_split,
    stratify=arxiv_data_filtered["terms"].values,
)

# Splitting the test set further into validation
# and new test sets.
val_df = test_df.sample(frac=0.5)
test_df.drop(val_df.index, inplace=True)
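A quick sanity check on the resulting split sizes (illustrative, not part of the original flow):
print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")
print(f"Number of rows in test set: {len(test_df)}")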
# Multi-label binarization.
terms = tf.ragged.constant(train_df["terms"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
lookup.adapt(terms)
vocab = lookup.get_vocabulary()


def invert_multi_hot(encoded_labels):
    """Reverse a single multi-hot encoded label to a tuple of vocab terms."""
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(vocab, hot_indices)
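To see the binarization in action, here is a small illustrative round trip: encode the first training sample's labels and invert them back with invert_multi_hot.
sample = tf.ragged.constant([train_df["terms"].iloc[0]])
encoded = lookup(sample).numpy()[0]
print("Original labels: ", train_df["terms"].iloc[0])
print("Recovered labels:", invert_multi_hot(encoded))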
Step 2: Dataset Generator Function
max_seqlen = 150
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE


def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["terms"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["summaries"].values, label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)
# Prepare the dataset
train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)
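Before vectorizing, it can help to preview a batch to confirm the pipeline wiring; this sanity check is illustrative.
text_batch, label_batch = next(iter(train_dataset))
for i, text in enumerate(text_batch[:3]):
    print(f"Abstract: {text}")
    print(f"Label(s): {invert_multi_hot(label_batch[i].numpy())}")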
Step 3: The Model
def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="sigmoid"),
        ]  # More on why "sigmoid" has been used here in a moment.
    )
    return shallow_mlp_model
Step 4: Training
# Need to vectorize the data before passing it to the NN. For `max_tokens`,
# we follow the original tutorial and use the maximum number of words found
# in a single training abstract as the vocabulary size.
train_df["total_words"] = train_df["summaries"].str.split().str.len()
vocabulary_size = train_df["total_words"].max()

text_vectorizer = layers.TextVectorization(
    max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
)
# The `TextVectorization` layer needs to be adapted as per the vocabulary
# from our training set.
with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))
train_dataset = train_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
validation_dataset = validation_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
test_dataset = test_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
Now for the promised note on "sigmoid": since an abstract can carry several labels at once, the output units must produce independent probabilities rather than a single distribution that sums to 1. A sigmoid on the final layer, paired with the binary_crossentropy loss, treats each label as its own yes/no decision, whereas softmax with categorical cross-entropy would force the labels to compete.
epochs = 20

shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["categorical_accuracy"]
)

history = shallow_mlp_model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs
)
Step 5: Evaluation
def plot_result(item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


plot_result("loss")
plot_result("categorical_accuracy")
_, categorical_acc = shallow_mlp_model.evaluate(test_dataset)
print(f"Categorical accuracy on the test set: {round(categorical_acc * 100, 2)}%.")

We obtain a categorical accuracy of around 84% on the test set.
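As a follow-up, here is a minimal inference sketch in the spirit of the original tutorial: wrapping text_vectorizer and the trained model in a single Sequential model lets us feed raw abstract strings directly and read off the top predicted labels. The sample size and top-3 cutoff are illustrative choices.
# Bundle the vectorizer with the trained model for end-to-end inference.
model_for_inference = keras.Sequential([text_vectorizer, shallow_mlp_model])

# Build a small batch of raw (un-vectorized) test abstracts for the demo.
inference_dataset = make_dataset(test_df.sample(16), is_train=False)
text_batch, label_batch = next(iter(inference_dataset))
predicted_probabilities = model_for_inference.predict(text_batch)

for i, text in enumerate(text_batch[:3]):
    top_3_labels = [
        label
        for _, label in sorted(zip(predicted_probabilities[i], vocab), reverse=True)[:3]
    ]
    print(f"Abstract: {text}")
    print(f"True label(s): {invert_multi_hot(label_batch[i].numpy())}")
    print(f"Predicted top 3: {top_3_labels}")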
[Optional]
Find the public notebook here.
[Bonus]
The above blog is an excerpt from a Keras tutorial by Sayak Paul & Soumik Rakshit. The Keras examples page has lots of awesome tutorials and code recipes; check them out here.