Knowledge Graph Implementation

I found this Medium tutorial on how to implement a simple knowledge graph. Consider this my “hello world” for knowledge graphs 🙂 Steps to represent knowledge in a graph:

  1. Sentence segmentation
  2. Entity extraction
  3. Relation extraction

Load dependencies

In [ ]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

Reading data file

In [2]:
candidate_sentences = pd.read_csv("wiki_sentences_v2.csv")
In [3]:
candidate_sentences.head()
Out[3]:
sentence
0 confused and frustrated, connie decides to leave on her own.
1 later, a woman’s scream is heard in the distance.
2 christian is then paralyzed by an elder.
3 the temple is set on fire.
4 outside, the cult wails with him.
In [4]:
candidate_sentences.shape
Out[4]:
(4318, 1)

Entity Pairs Extraction

  • We need to extract the subject and object from these sentences
In [5]:
def get_entities(sent):
    """Extract a (subject, object) entity pair from one sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    its tokens are scanned left to right. Compound words and modifiers seen
    before a subject/object token are prepended to that token's text.

    Args:
        sent: Sentence text to parse.

    Returns:
        A two-element list ``[ent1, ent2]``: the text built for the last
        subject-like token and the last object-like token, each stripped of
        leading/trailing whitespace. Either element is ``""`` when no such
        token was found.
    """
    ## chunk 1
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        ## chunk 2
        # punctuation carries no entity information: skip it entirely
        # (it does not update the "previous token" state either)
        if tok.dep_ == "punct":
            continue

        # check: token is a compound word or not
        if tok.dep_ == "compound":
            prefix = tok.text
            # if the previous word was also a 'compound' then join the two
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # check: token is a modifier or not
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            # if the previous word was a 'compound' then join the two
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        ## chunk 3
        # Substring membership rather than `find(...) == True`: the latter
        # only held when "subj" started at index 1 (e.g. "nsubj") and would
        # miss a label that is exactly "subj".
        if "subj" in tok.dep_:
            # NOTE: an empty modifier/prefix still contributes its separator,
            # so the result can contain consecutive spaces.
            ent1 = modifier + " " + prefix + " " + tok.text
            # the subject consumed the pending context: start afresh
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""

        ## chunk 4
        # Same membership test for objects ("dobj", "pobj", "obj", ...).
        if "obj" in tok.dep_:
            ent2 = modifier + " " + prefix + " " + tok.text

        ## chunk 5
        # update variables
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]
In [6]:
# Build one [subject, object] pair per sentence; tqdm shows progress.
entity_pairs = [
    get_entities(sentence)
    for sentence in tqdm(candidate_sentences["sentence"])
]
100%|██████████| 4318/4318 [00:45<00:00, 95.95it/s] 
In [8]:
entity_pairs[10:20]
Out[8]:
[['we', 'tests'],
 ['global', 'international sales rights'],
 ['robbie  robertson', 'soundtrack'],
 ['it', 'original music tracks'],
 ['it', 'reviewed  franchise'],
 ['she', 'accidentally  mystique'],
 ['military  forces', 'arrest'],
 ['train', 'vuk'],
 ['kota eberhardt', 'telepath selene gallio'],
 ['singer', 'sequel']]

Relation / Predicate Extraction

  • The hypothesis here is that the predicate is the main verb in the sentence
In [9]:
def get_relation(sent):
    """Return the predicate phrase of a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    matched against a pattern made of the ROOT token optionally followed by
    a preposition, an agent and an adjective. The span of the last match
    reported by the matcher is used.

    Args:
        sent: Sentence text to parse.

    Returns:
        The text of the matched span, or ``""`` when the matcher finds
        nothing (for example, for an empty sentence).
    """
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern: ROOT [prep] [agent] [ADJ]
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    # NOTE(review): this is the (name, on_match, pattern) call form; newer
    # spaCy releases expect matcher.add("matching_1", [pattern]) instead.
    # Confirm against the installed spaCy version.
    matcher.add("matching_1", None, pattern)

    matches = matcher(doc)
    if not matches:
        # Previously this indexed matches[-1] on an empty list and raised
        # IndexError; report "no relation" instead.
        return ""

    # each match is (match_id, start, end); keep the last one
    _, start, end = matches[-1]
    return doc[start:end].text
In [11]:
# One predicate string per sentence, in the same order as the sentences.
sentence_iter = tqdm(candidate_sentences['sentence'])
relations = [get_relation(sentence) for sentence in sentence_iter]
100%|██████████| 4318/4318 [00:47<00:00, 90.82it/s]

Build a Knowledge Graph

In [13]:
# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})
In [14]:
kg_df.head()
Out[14]:
source target edge
0 connie own decides
1 later woman distance heard in
2 christian then elder paralyzed by
3 temple fire set on
4 outside cult him wails with

Focusing on specific relations for better visualisation

In [17]:
# Frequency of each extracted relation string.
all_relations = pd.Series(relations).value_counts()
all_relations
Out[17]:
is                   368
was                  283
released on           79
are                   72
include               69
                    ... 
act as                 1
hired for              1
funded through         1
earned                 1
served as overall      1
Length: 1669, dtype: int64
In [18]:
def plot_kg_specific_relations(relation):
    """Draw the sub-graph of ``kg_df`` whose edge label equals ``relation``.

    Rows of the module-level ``kg_df`` with ``edge == relation`` are turned
    into a directed multigraph (source -> target) and drawn on a 12x12
    figure using a spring layout.

    Args:
        relation: Edge label to select, compared for equality with the
            ``edge`` column.
    """
    selected_rows = kg_df[kg_df['edge'] == relation]
    graph = nx.from_pandas_edgelist(
        selected_rows,
        source="source",
        target="target",
        edge_attr=True,
        create_using=nx.MultiDiGraph(),
    )

    plt.figure(figsize=(12, 12))
    # k regulates the distance between nodes
    layout = nx.spring_layout(graph, k=0.5)
    nx.draw(
        graph,
        pos=layout,
        with_labels=True,
        node_color='skyblue',
        node_size=1500,
        edge_cmap=plt.cm.Blues,
    )
    plt.show()
In [35]:
def plot_kg_specific_entity(entity):
    """Draw the sub-graph of ``kg_df`` whose source node equals ``entity``.

    Rows of the module-level ``kg_df`` with ``source == entity`` are turned
    into a directed multigraph (source -> target) and drawn on a 12x12
    figure using a spring layout.

    Args:
        entity: Source node name to select, compared for equality with the
            ``source`` column.
    """
    selected_rows = kg_df[kg_df['source'] == entity]
    graph = nx.from_pandas_edgelist(
        selected_rows,
        source="source",
        target="target",
        edge_attr=True,
        create_using=nx.MultiDiGraph(),
    )

    plt.figure(figsize=(12, 12))
    # k regulates the distance between nodes
    layout = nx.spring_layout(graph, k=0.5)
    nx.draw(
        graph,
        pos=layout,
        with_labels=True,
        node_color='skyblue',
        node_size=1500,
        edge_cmap=plt.cm.Blues,
    )
    plt.show()
In [19]:
plot_kg_specific_relations("composed by")
In [22]:
plot_kg_specific_relations("released on")
In [37]:
plot_kg_specific_entity('g. george')
In [38]:
plot_kg_specific_entity('schwarzenegger')
In [39]:
plot_kg_specific_entity('arthur')
Ryan

Ryan

Data Scientist

Leave a Reply