Rohan Singh's Weblog

Writing about software, data science, and things I learn along the way.

Spacy Matcher

python

Spacy Matcher

Let's say you want to write a rule-based (regex-like) program and also want to leverage linguistic features such as POS tags, dependency parsing, etc.

Spacy's matcher provides a powerful and simple way to achieve this goal.

# import libraries
import spacy
from spacy.matcher import Matcher
from datetime import datetime

# load the small English model; it supplies the POS tags and lemmas
# that the token-level patterns below rely on
nlp = spacy.load("en_core_web_sm")

Spacy Matcher Basics (token level)

Full Documentation

sample_document1 = "The quick brown fox jumps over the lazy dog."
sample_document2 = "The quick black fox jumps over the lazy monkey"

# token-level rule matcher backed by the shared vocab
matcher = Matcher(nlp.vocab)

# pattern 1: the literal token "fox" (case-insensitive via LOWER)
pattern1 = [[{"LOWER": "fox"}]]

# pattern 2: any inflection of the lemma "jump" (jump, jumps, jumped, ...)
pattern2 = [[{"LEMMA": "jump"}]]

# pattern 3: the token "lazy" followed by a noun (e.g. "lazy dog")
# (original comment had the order reversed)
pattern3 = [[{"LOWER": "lazy"}, {"POS": "NOUN"}]]

# pattern 4: "quick" followed by a token whose text matches the
# regex b(rown|lack), i.e. "brown" or "black"
pattern4 = [[{"LOWER": "quick"}, {"TEXT": {"REGEX": "b(rown|lack)"}}]]

matcher.add("fox", pattern1)
matcher.add("jump", pattern2)
matcher.add("lazyanimals", pattern3)
matcher.add("quick{regex}", pattern4)

# Run the token matcher over each sample document and print every match
# as (span text, char offsets, pattern name), plus a rough timing.
for document in [sample_document1, sample_document2]:
    # fixed typo in the progress message ("Proceesing" -> "Processing")
    print("Processing document: ", document)
    tik = datetime.now()
    doc = nlp(document)
    matches = matcher(doc)

    print(f"{'span':20}\tstart\tend\tMatched Pattern")
    print(f"{'-'*20:20}\t{'-'*5}\t{'-'*5}\t{'-'*20}")
    # unpack directly in the loop header; `match_id` avoids shadowing
    # the builtin `id`
    for match_id, start, end in matches:
        span = doc[start:end]
        # the match id is a hash; look up the pattern name in the StringStore
        print(f"{span.text:20}\t{span.start_char}\t{span.end_char}\t{nlp.vocab.strings[match_id]}")
    print("\n")
    tok = datetime.now()
    print(f"Time taken: {(tok-tik)}\n")

Spacy Phrase Matcher

Match a large list of terms — fast.

Full Documentation

from spacy.matcher import PhraseMatcher

# PhraseMatcher matches exact phrases (as pre-tokenized Doc objects),
# which makes it efficient for long term lists.
phrase_matcher = PhraseMatcher(nlp.vocab)
terms = ["fox", "lazy dog", "lazy monkey"]

# nlp.make_doc tokenizes each term without running the full pipeline
patterns = list(map(nlp.make_doc, terms))

phrase_matcher.add("eatlazyanimals", patterns)

# Run the phrase matcher over each sample document and print every match,
# mirroring the token-matcher loop above.
for document in [sample_document1, sample_document2]:
    # fixed: actually print the document being processed (the original
    # omitted it, unlike the token-matcher loop) and the "Proceesing" typo
    print("Processing document: ", document)
    tik = datetime.now()
    doc = nlp(document)
    matches = phrase_matcher(doc)

    print(f"{'span':20}\tstart\tend\tMatched Pattern")
    print(f"{'-'*20:20}\t{'-'*5}\t{'-'*5}\t{'-'*20}")
    # `match_id` avoids shadowing the builtin `id`
    for match_id, start, end in matches:
        span = doc[start:end]
        print(f"{span.text:20}\t{span.start_char}\t{span.end_char}\t{nlp.vocab.strings[match_id]}")
    print("\n")
    tok = datetime.now()
    print(f"Time taken: {(tok-tik)}\n")

← Back to TIL