import re
import nltk
import numpy as np
import pandas as pd
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
# English stopword list from the NLTK stopwords corpus; remove_stopwords()
# filters sentence tokens against it.
stop_words = stopwords.words('english')

Sentence Tokenisation

# Split every article into sentences. Each article contributes one list, so
# `sentences` is a list of lists until it is flattened in the processing step.
# NOTE(review): the loop body was missing in the source; appending the
# sent_tokenize() result is the presumed intent (sent_tokenize is imported and
# the next step flattens a nested list) -- confirm.
sentences = []
for s in df['article_text']:
    sentences.append(sent_tokenize(s))

Process Sentences

# Text processing
# Flatten the per-article sentence lists into one flat list of sentences.
sentences = [sentence for article in sentences for sentence in article]

# Remove punctuation, digits and special characters: every character outside
# a-z / A-Z becomes a space. regex=True is passed explicitly because
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave the text unchanged.
clean_sentences = pd.Series(sentences).str.replace(r"[^a-zA-Z]", " ", regex=True)

# Lowercase every sentence (this also turns the Series back into a plain list).
clean_sentences = [s.lower() for s in clean_sentences]

# function to remove stopwords
def remove_stopwords(sen):
    """Return the tokens of `sen` that are not stopwords, joined by spaces.

    `sen` is an iterable of tokens (the caller passes the result of
    `str.split()`). Tokens found in the module-level `stop_words` are
    dropped; the remaining ones keep their original order.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Strip stopwords from each cleaned sentence: split it on whitespace, filter
# the tokens, and join what is left back into a string.
clean_sentences = list(map(remove_stopwords, (r.split() for r in clean_sentences)))

Loading and Applying Word Embeddings (GloVe)

You can download the pre-trained GloVe embeddings here: https://nlp.stanford.edu/projects/glove/ (the code below uses glove.6B.100d.txt).

# Extract word vectors
# Each line is split on whitespace: the first token is the word and the
# remaining tokens are parsed as its float32 coefficients.
word_embeddings = {}
# The context manager closes the file once loading finishes (or fails).
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines so values[0] cannot raise IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

# Build one 100-dimensional vector per cleaned sentence by averaging the
# vectors of its words.
sentence_vectors = []
for i in clean_sentences:
    words = i.split()
    if words:
        # Words missing from the embeddings count as zero vectors; the +0.001
        # in the divisor is kept from the original formula.
        v = sum(word_embeddings.get(w, np.zeros((100,))) for w in words) / (len(words) + 0.001)
    else:
        # Nothing left after cleaning: fall back to an all-zero vector.
        v = np.zeros((100,))
    # Append in every case so sentence_vectors stays index-aligned with
    # `sentences`, which the similarity-matrix step relies on.
    sentence_vectors.append(v)

Constructing Similarity Matrix

We construct the similarity matrix using cosine similarity. This is where you can experiment with different similarity functions!

# Pairwise cosine similarity between sentence vectors. The diagonal stays 0
# because a sentence is never compared with itself.
n_sentences = len(sentences)
sim_mat = np.zeros([n_sentences, n_sentences])

for row in range(n_sentences):
    for col in range(n_sentences):
        if row == col:
            continue
        # cosine_similarity expects 2-D inputs, hence the (1, 100) reshape.
        row_vec = sentence_vectors[row].reshape(1, 100)
        col_vec = sentence_vectors[col].reshape(1, 100)
        sim_mat[row][col] = cosine_similarity(row_vec, col_vec)[0, 0]

Apply PageRank to Similarity Matrix and Select Top Salient Sentences

This is where we convert the similarity matrix into a graph and apply the PageRank algorithm to it to get the ranked sentences.

# Turn the similarity matrix into a graph and score every node with PageRank.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

# Pair each sentence with the score of its index, then order the pairs from
# the highest score to the lowest.
ranked_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

# Extract top 10 sentences as the summary
# NOTE(review): the loop body was missing in the source; printing the sentence
# text of each (score, sentence) pair is the presumed intent -- confirm.
# Slicing instead of range(10) avoids an IndexError when fewer than ten
# sentences were ranked.
for _, sentence in ranked_sentences[:10]:
    print(sentence)


Data Scientist

Leave a Reply