Train your first Neural Network for Named Entity Recognition
Named Entity Recognition, or simply NER, is an information extraction technique used to improve search, recommendation engines, customer service, algorithmic trading, and many other applications. It seeks to locate and classify named entities mentioned in unstructured text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, and percentages.
In this tutorial, we are going to train a BiLSTM network on a NER dataset. The dataset is an extract from the GMB (Groningen Meaning Bank) corpus, tagged and annotated specifically for training a classifier to predict named entities such as names, locations, etc.
Let's get into it.
Step 1: Load and preprocess the dataset
We will perform the following preprocessing operations (a toy illustration of the forward fill follows this list):
- Fill NA (null values) in the Sentence # column with a forward fill (ffill)
- Map each word and each tag in every sentence to an integer token_id
- Pad each sentence to a common length max_len, so the BiLSTM network receives fixed-length input
- One-hot encode the tag indices, so we can compute the categorical cross-entropy loss at the end of the network
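To see why the forward fill matters: in ner_dataset.csv the Sentence # column is populated only on the first word of each sentence. Here is a minimal sketch with hypothetical rows mimicking that layout:

import pandas as pd

# Toy rows mimicking ner_dataset.csv: 'Sentence #' appears only on the
# first word of each sentence, so a forward fill propagates it down.
toy = pd.DataFrame({
    'Sentence #': ['Sentence: 1', None, None, 'Sentence: 2', None],
    'Word': ['Thousands', 'of', 'demonstrators', 'Families', 'of'],
    'Tag': ['O', 'O', 'O', 'O', 'O'],
})
print(toy.ffill())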
import pandas as pd

data = pd.read_csv('../input/entity-annotated-corpus/ner_dataset.csv', encoding='unicode_escape')
data.head()
def get_dict_map(data, token_or_tag):
    # Build bidirectional lookups between vocabulary items and integer ids
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    idx2tok = {idx: tok for idx, tok in enumerate(vocab)}
    tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

# Map every word and tag to its integer id
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

# Forward-fill the 'Sentence #' column, then collect each sentence's
# words, tags, and ids into parallel lists (one row per sentence)
data_fillna = data.ffill(axis=0)
data_group = data_fillna.groupby(
    'Sentence #', as_index=False
)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))
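A quick, purely illustrative sanity check (column names as above) confirms that each grouped row now holds one full sentence as parallel lists:

print(len(data_group), 'sentences')
print(data_group.iloc[0]['Word'][:5])
print(data_group.iloc[0]['Tag'][:5])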
Step 2: Partition the dataset into Train, Validation & Test set
We split off a 10% held-out test set here; the validation set is carved out of the training data later via validation_split in model.fit.
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):
    n_token = len(set(data['Word'].to_list()))
    n_tags = len(tag2idx)

    # Pad every sentence of token ids to the longest sentence length,
    # using a fresh id (n_token) reserved for padding rather than a real word id
    tokens = data_group['Word_idx'].tolist()
    maxlen = max(len(s) for s in tokens)
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32',
                               padding='post', value=n_token)

    # Pad the tag sequences with the 'O' (outside) tag, then one-hot encode
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32',
                             padding='post', value=tag2idx['O'])
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    # Hold out 10% of the sentences as a test set
    train_tokens, test_tokens, train_tags, test_tags = train_test_split(
        pad_tokens, pad_tags, test_size=0.1, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tags length:', len(test_tags)
    )
    return train_tokens, test_tokens, train_tags, test_tags

train_tokens, test_tokens, train_tags, test_tags = get_pad_train_test_val(data_group, data)
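If the post-padding step is unfamiliar, here is a minimal standalone sketch of what pad_sequences does with padding='post':

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Two toy 'sentences' of token ids, padded at the end to length 4
print(pad_sequences([[5, 2], [7, 1, 3]], maxlen=4, padding='post', value=0))
# [[5 2 0 0]
#  [7 1 3 0]]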
Step 3: Build the Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense

# Dimensions derived from the data (output_dim = 64 is an illustrative default)
input_dim = len(token2idx) + 1   # vocabulary size, plus one id for padding
output_dim = 64                  # embedding / LSTM width
input_length = max(len(s) for s in data_group['Word_idx'].tolist())  # max_len
n_tags = len(tag2idx)

def get_bilstm_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    # softmax (not relu): this head classifies each token over the tag set
    model.add(TimeDistributed(Dense(n_tags, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
Step 4: Training
import numpy as np

def train_model(X, y, model):
    loss = list()
    for _ in range(25):
        # fit the model for one epoch, keeping 20% of the data for validation
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

model_bilstm_lstm = get_bilstm_lstm_model()
results = pd.DataFrame()
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)
Step 5: Evaluation
import matplotlib.pyplot as plt

plt.plot(results['with_add_lstm'])
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()
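The loss curve only tells part of the story. As a rough quantitative check, here is a minimal sketch using the variables defined above. Keep in mind that per-token accuracy is inflated by the padded 'O' positions; entity-level F1 (e.g. via the seqeval package) is the more informative metric for NER.

test_loss, test_acc = model_bilstm_lstm.evaluate(np.array(test_tokens), np.array(test_tags), verbose=0)
print('test loss:', round(test_loss, 4), 'test accuracy:', round(test_acc, 4))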

For a quick visual sanity check, we can render entities with spaCy's pretrained pipeline (note that this is spaCy's own model, used here for illustration, not the BiLSTM we just trained; the small English model en_core_web_sm is assumed to be installed):

import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
text = nlp('Jim bought 300 shares of Acme Corp. in 2006.')
displacy.render(text, style='ent', jupyter=True)

In this example, a person name consisting of one token (Jim), a two-token company name (Acme Corp.) and a temporal expression (2006) are detected and classified. State-of-the-art NER systems for English produce near-human performance: for example, the best system entering MUC-7 scored an F-measure of 93.39%, while human annotators scored 97.60% and 96.95%.
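Since the render above relies on spaCy rather than our network, here is a minimal sketch (assuming the variables from the earlier steps) of how the trained BiLSTM tags one held-out sentence:

# Predict tag probabilities for one padded test sentence and decode the ids
sample = np.array(test_tokens[:1])            # shape: (1, max_len)
pred = model_bilstm_lstm.predict(sample)      # shape: (1, max_len, n_tags)
pred_tags = [idx2tag[i] for i in pred[0].argmax(axis=-1)]
print(pred_tags[:10])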

[Optional]
Check out the public notebook with the full implementation above:
[Bonus]
This post is meant as a short tutorial; for more detailed explanations, check out the tutorial below:

Cheers!