This is the final blog post of the multi-part series on the TFIDF implementation for summarisation. I have listed all the library dependencies for this code file, as well as a simple helper function to read in the CNN/Daily Mail dataset, which has been provided to us as text files. In the code below, I have only shown the application of the TFIDF class to the CNN/Daily Mail test set, but the process is essentially the same for the train and validation sets.

I have outlined the entire TFIDF class again at the bottom of this blog post for completeness and ease of reference.

Dependencies & Helper Function

import os
import re
import math
import nltk
import operator
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize

# Module-level word tokenizer: r'\w+' keeps alphanumeric runs and drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
def read_data(filename):
    """Read a text file line by line into a single-column pandas DataFrame.

    Each line (including its trailing newline) becomes one row in column 0.
    Blank lines are kept here; callers filter them out (e.g. rows == '\n').

    :param filename: path to the dataset text file
    :return: pd.DataFrame with one line per row in column 0
    """
    lines = []
    # BUG FIX: the original read lines in a while-loop but never appended
    # them to the accumulator (and dropped the first line), so it always
    # returned an empty DataFrame. Iterating the file handle collects every
    # line exactly once.
    with open(filename, "r") as f:
        for line in f:
            lines.append(line)
    return pd.DataFrame(lines)

if __name__ == "__main__":
    # Read data: point read_data() at the CNN/Daily Mail test-set text file.
    test_df = read_data('')
    # Drop rows that are only a newline (blank lines in the source file).
    test_df = test_df[test_df[0] != '\n']
    test_df.reset_index(inplace = True, drop = True)

    # TFIDF each article (row by row)
    test_tfidf = []
    for index, row in test_df.iterrows():
        # Only summarise long articles (> 450 whitespace-separated tokens).
        if len(row[0].split(' ')) > 450:
            TFIDF_extractor = TFIDF_single_doc_extractor(row[0], True)
            sentence_list = TFIDF_extractor.read_and_sent_tokenise()
            tf_idf_matrix = TFIDF_extractor.create_tf_idf_matrix()
            sentence_value_dict = TFIDF_extractor.score_sentences(tf_idf_matrix, using_pos_tag = True)
            test_tfidf.append(TFIDF_extractor.generate_summary_by_top_sentences(sentence_list, sentence_value_dict, 0.5))
        else:
            # BUG FIX: the original appended nothing for short articles, so
            # assigning test_tfidf as a DataFrame column raised a length
            # mismatch whenever any article had <= 450 tokens. Keep short
            # articles as their own "summary" to preserve row alignment.
            test_tfidf.append(row[0])
    test_df['tfidf'] = test_tfidf
    test_df['tfidf'] = test_df['tfidf'].apply(lambda x: x.strip())

Above is a simple example of how to use the TFIDF class to summarise a dataset of news articles. Once we have read in the test set using our read_data() helper function, we will iterate through each article (each row). For each article, we will:

  1. Initiate the TFIDF class with the article as input
  2. Execute the read_and_sent_tokenise() method to split the article into sentences
  3. Execute the create_tf_idf_matrix() method to calculate the TFIDF for each word in each sentence
  4. Execute score_sentences() method to score each sentence. With using_pos_tag argument set to True, we will only use the TFIDF score of the nouns and verbs to compute the final score of each sentence
  5. Execute either generate_summary_by_avg() or generate_summary_by_top_sentences() method to generate the final summary
  6. Save the summary to test_tfidf array

Once we have looped through all the articles in the test set, our test_tfidf array should contain the summary of all the articles in the test set.

Our TFIDF Class

class TFIDF_single_doc_extractor():
    """Extractive summariser for a single document based on sentence-level TF-IDF.

    Each sentence is treated as a mini-document: TF is computed per sentence,
    and IDF uses the number of sentences containing each word. Sentences are
    scored by their (optionally POS-filtered) average TF-IDF, and the summary
    is formed from either the above-average sentences or the top-k sentences.
    """

    def __init__(self, document, isText = False):
        """
        :param document: path to a text file, or the raw article text itself
        :param isText: True when `document` is raw text rather than a file path
        """
        self.document = document
        self.isText = isText
        self.sentences = None           # list of sentence strings
        self.total_sentences = None     # len(self.sentences)

        self.freq_matrix = {}           # sentence -> {word: raw count}
        self.tf_matrix = {}             # sentence -> {word: term frequency}
        self.word_occurence_table = {}  # word -> number of sentences containing it
        self.idf_matrix = {}            # sentence -> {word: idf}
        # BUG FIX: tf_idf_matrix was used in create_tf_idf_matrix() but never
        # initialised here, raising AttributeError on first use.
        self.tf_idf_matrix = {}         # sentence -> {word: tf * idf}

        self.sentence_value = {}        # sentence -> average tf-idf score
        self.average_sentence_value = 0
        self.summary = ''
        self.sentence_count = 0

    def read_and_sent_tokenise(self):
        """Load the document (from file unless isText) and split it into sentences.

        :return: list of sentences (also stored on self.sentences)
        """
        if self.isText == False:
            # BUG FIX: the original left the file handle open and the read
            # statement was truncated; use a context manager and read fully.
            with open(self.document, encoding='utf-8') as f:
                text = f.read()
        else:
            text = self.document
        self.sentences = sent_tokenize(text)
        self.total_sentences = len(self.sentences)
        return self.sentences

    def create_frequency_matrix(self):
        """Count lowercased, lemmatised, non-stopword tokens per sentence."""
        _stopwords = set(stopwords.words('english'))
        wordnet = WordNetLemmatizer()
        for sentence in self.sentences:
            freq_table = {}
            for word in tokenizer.tokenize(sentence):
                word = wordnet.lemmatize(word.lower())
                if word in _stopwords:
                    continue  # stopwords carry no topical signal
                if word in freq_table:
                    freq_table[word] += 1
                else:
                    freq_table[word] = 1
            self.freq_matrix[sentence] = freq_table

    def create_tf_matrix(self):
        """TF = word count / number of distinct non-stopword words in the sentence."""
        for sentence, freq_table in self.freq_matrix.items():
            tf_table = {}
            no_words_in_sentence = len(freq_table)
            for word, count in freq_table.items():
                tf_table[word] = count / no_words_in_sentence
            self.tf_matrix[sentence] = tf_table

    def create_sentences_per_words(self):
        """Count, for each word, how many sentences contain it (document frequency)."""
        for sentence, freq_table in self.freq_matrix.items():
            for word in freq_table.keys():
                if word in self.word_occurence_table:
                    self.word_occurence_table[word] += 1
                else:
                    self.word_occurence_table[word] = 1

    def create_idf_matrix(self):
        """IDF = log10(total sentences / number of sentences containing the word)."""
        for sentence, freq_table in self.freq_matrix.items():
            idf_table = {}
            for word in freq_table.keys():
                idf_table[word] = math.log10(self.total_sentences / float(self.word_occurence_table[word]))
            self.idf_matrix[sentence] = idf_table

    def create_tf_idf_matrix(self):
        """Build the sentence -> {word: tf*idf} matrix.

        Runs the prerequisite steps (frequency, TF, DF, IDF) automatically if
        they have not been run yet, so callers can go straight from
        read_and_sent_tokenise() to this method.

        :return: the tf-idf matrix (also stored on self.tf_idf_matrix)
        """
        if not self.freq_matrix:
            self.create_frequency_matrix()
        if not self.tf_matrix:
            self.create_tf_matrix()
        if not self.word_occurence_table:
            self.create_sentences_per_words()
        if not self.idf_matrix:
            self.create_idf_matrix()
        for sentence, tf_table in self.tf_matrix.items():
            # Look IDF values up by key instead of zipping two dicts in
            # parallel — robust even if insertion orders ever diverge.
            idf_table = self.idf_matrix[sentence]
            tf_idf_table = {}
            for word, tf in tf_table.items():
                tf_idf_table[word] = float(tf * idf_table[word])
            self.tf_idf_matrix[sentence] = tf_idf_table
        return self.tf_idf_matrix

    def score_sentences(self, tf_idf_matrix, using_pos_tag = False):
        """Score each sentence by its average word TF-IDF.

        :param tf_idf_matrix: sentence -> {word: tf*idf} mapping
        :param using_pos_tag: if True, only nouns and verbs contribute to the score
        :return: dict sentence -> score (also stored on self.sentence_value)
        """
        wordnet = WordNetLemmatizer()
        for sentence, tf_idf_table in tf_idf_matrix.items():
            total_score_sentence = 0
            no_words_in_sentence = len(tf_idf_table)
            pos_tagged_noun_verb = []
            if using_pos_tag:
                for word, tag in nltk.pos_tag(tokenizer.tokenize(sentence)):
                    # Penn Treebank noun/verb tags.
                    if tag in ("NN", "NNP", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"):
                        # Normalise exactly like create_frequency_matrix() so
                        # the surface form matches the tf-idf table keys.
                        pos_tagged_noun_verb.append(wordnet.lemmatize(word.lower()))
            for word, score in tf_idf_table.items():
                if using_pos_tag:
                    if word in pos_tagged_noun_verb:
                        total_score_sentence += score
                else:
                    total_score_sentence += score
            if no_words_in_sentence > 0:
                self.sentence_value[sentence] = total_score_sentence / no_words_in_sentence
            else:
                # All-stopword sentences have an empty table; avoid ZeroDivisionError.
                self.sentence_value[sentence] = 0
        return self.sentence_value

    def find_average_sentence_score(self, sentence_value_dict):
        """Return the mean sentence score (threshold basis for generate_summary_by_avg)."""
        total_value_all_sentences = 0
        for sentence in sentence_value_dict:
            total_value_all_sentences += sentence_value_dict[sentence]
        self.average_sentence_value = total_value_all_sentences / len(sentence_value_dict)
        return self.average_sentence_value

    def generate_summary_by_avg(self, sentences, sentence_value_dict, threshold):
        """Concatenate, in document order, every sentence scoring >= threshold."""
        for sentence in sentences:
            if sentence in sentence_value_dict and sentence_value_dict[sentence] >= (threshold):
                self.summary += " " + sentence
                self.sentence_count += 1
        return self.summary

    def generate_summary_by_top_sentences(self, sentences, sentence_value_dict, top_sentences_ratio):
        """Concatenate, in document order, the top fraction of sentences by score.

        :param sentences: sentences in original document order
        :param sentence_value_dict: sentence -> score mapping
        :param top_sentences_ratio: fraction (0..1) of sentences to keep
        :return: the summary string (also accumulated on self.summary)
        """
        final_sentence_list = []
        sorted_d = sorted(sentence_value_dict.items(), key=operator.itemgetter(1), reverse = True)
        top_sentences = round(top_sentences_ratio * len(sorted_d))
        # BUG FIX: the original loop body was missing, so no sentence was ever
        # selected and the summary came back empty.
        for key, value in sorted_d[:top_sentences]:
            final_sentence_list.append(key)
        # Emit the selected sentences in their original order, not score order.
        for sentence in sentences:
            if sentence in final_sentence_list:
                self.summary += " " + sentence
                self.sentence_count += 1
        return self.summary


Data Scientist

Leave a Reply