Spacy Matcher
15th January 2026
Spacy Matcher
Let's say you want to write a rule-based (regex-like) program and also want to leverage linguistic features such as POS tags, dependency parsing, etc.
Spacy's matcher provides a powerful and simple way to achieve this goal.
# import libraries
import spacy
from spacy.matcher import Matcher
from datetime import datetime
# load model
# `nlp` is the pipeline object used below both to process the sample
# documents and to supply the vocabulary shared with the matchers.
# NOTE: the "en_core_web_sm" package must already be installed.
nlp = spacy.load("en_core_web_sm")
Spacy Matcher Basics (token level)
# Two near-identical sentences used to exercise the rules below.
sample_document1 = "The quick brown fox jumps over the lazy dog."
sample_document2 = "The quick black fox jumps over the lazy monkey"

# The matcher is built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Each rule is a list of alternative patterns; each pattern is a list of
# per-token attribute specs. Rules are registered in the order defined.

# a single token whose lowercase form is "fox"
pattern1 = [[{"LOWER": "fox"}]]
matcher.add("fox", pattern1)

# a single token whose lemma (root word) is "jump"
pattern2 = [[{"LEMMA": "jump"}]]
matcher.add("jump", pattern2)

# the word "lazy" followed by a noun
pattern3 = [[{"LOWER": "lazy"}, {"POS": "NOUN"}]]
matcher.add("lazyanimals", pattern3)

# the word "quick" followed by a token whose text matches a regex
pattern4 = [[{"LOWER": "quick"}, {"TEXT": {"REGEX": "b(rown|lack)"}}]]
matcher.add("quick{regex}", pattern4)
# Run the token-level matcher over each sample document and print every match
# as a table row, followed by the time spent on that document.
for document in [sample_document1, sample_document2]:
    print("Processing document:", document)
    # Timing starts before parsing, so it covers nlp(), matching and printing.
    tik = datetime.now()
    doc = nlp(document)
    # Each match is a (match_id, start, end) triple; start/end index tokens.
    matches = matcher(doc)
    print(f"{'span':20}\tstart\tend\tMatched Pattern")
    print(f"{'-'*20:20}\t{'-'*5}\t{'-'*5}\t{'-'*20}")
    for match_id, start, end in matches:
        span = doc[start:end]
        # The start/end columns show the span's character offsets; the rule
        # name is recovered by looking match_id up in the vocab's strings.
        print(f"{span.text:20}\t{span.start_char}\t{span.end_char}\t{nlp.vocab.strings[match_id]}")
    print("\n")
    tok = datetime.now()
    print(f"Time taken: {(tok-tik)}\n")
Spacy Phrase Matcher
Match a large list of terms - fast
from spacy.matcher import PhraseMatcher
# Literal phrases to look up in the documents.
terms = ["fox", "lazy dog", "lazy monkey"]

# make_doc turns each term into a Doc that serves as a phrase pattern.
patterns = list(map(nlp.make_doc, terms))

# All terms are registered under a single rule name.
phrase_matcher = PhraseMatcher(nlp.vocab)
phrase_matcher.add("eatlazyanimals", patterns)
# Run the phrase matcher over each sample document and print every match as a
# table row, followed by the time spent on that document.
for document in [sample_document1, sample_document2]:
    # Show which document is being processed, as the token-matcher loop does.
    print("Processing document:", document)
    # Timing starts before parsing, so it covers nlp(), matching and printing.
    tik = datetime.now()
    doc = nlp(document)
    # Each match is a (match_id, start, end) triple; start/end index tokens.
    matches = phrase_matcher(doc)
    print(f"{'span':20}\tstart\tend\tMatched Pattern")
    print(f"{'-'*20:20}\t{'-'*5}\t{'-'*5}\t{'-'*20}")
    for match_id, start, end in matches:
        span = doc[start:end]
        # The start/end columns show the span's character offsets; the rule
        # name is recovered by looking match_id up in the vocab's strings.
        print(f"{span.text:20}\t{span.start_char}\t{span.end_char}\t{nlp.vocab.strings[match_id]}")
    print("\n")
    tok = datetime.now()
    print(f"Time taken: {(tok-tik)}\n")