Skip to content

Instantly share code, notes, and snippets.

@NoRaincheck
Created February 22, 2026 05:09
Show Gist options
  • Select an option

  • Save NoRaincheck/ed5c5db72f34b3eb6e8566b22d0c7788 to your computer and use it in GitHub Desktop.

Select an option

Save NoRaincheck/ed5c5db72f34b3eb6e8566b22d0c7788 to your computer and use it in GitHub Desktop.
Microbenchmark of BM25 and difflib. difflib is much, much slower, but probably 'good enough' for small contexts/corpora. Stop-word removal makes a big difference to both speed and retrieval performance.
# /// script
# requires-python = ">=3.14"
# dependencies = [
# "beir>=2.2.0",
# "rank-bm25>=0.2.2",
# ]
# ///
import datetime
import logging
import os
import pathlib
from beir import LoggingHandler, util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from rank_bm25 import BM25Okapi
from tqdm import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
#### /print debug information to stdout
# Download and unpack the BEIR Quora duplicate-questions benchmark into a
# "datasets" directory next to this script.
dataset = "quora"
url = (
    f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
)
out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
#### Load the Quora test split: corpus, queries, and relevance judgements (qrels)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
corpus_ids, query_ids = list(corpus), list(queries)
#### Tokenize corpus — plain whitespace split; no lowercasing or stop-word removal
corpus_texts = [corpus[corpus_id]["text"] for corpus_id in corpus_ids]
tokenized_corpus = [doc.split(" ") for doc in corpus_texts]
#### Index all passages into BM25
bm25 = BM25Okapi(tokenized_corpus)
#### Run BM25 retrieval for the first 100 benchmark queries
results = {}       # query_id -> {corpus_id: score} for its top-100 documents
per_query_ms = {}  # query_id -> retrieval latency in milliseconds
n_queries = 100
for qid in tqdm(query_ids[:n_queries]):
    text = queries[qid]
    # Time only the scoring and ranking work, not the ID mapping below.
    t0 = datetime.datetime.now()
    scores = bm25.get_scores(text.split(" "))
    ranked = scores.argsort()[::-1][:100]
    t1 = datetime.datetime.now()
    # Translate the ranked row indices back into corpus document IDs.
    results[qid] = {corpus_ids[i]: float(scores[i]) for i in ranked}
    per_query_ms[qid] = (t1 - t0).total_seconds() * 1000
latencies = list(per_query_ms.values())
logging.info(f"Average time taken: {sum(latencies) / len(per_query_ms):.2f}ms")
#### Evaluate retrieval performance
# EvaluateRetrieval() without a retriever is used purely for metric
# computation; judging by the logged output, evaluator.k_values covers
# k in {1, 3, 5, 10, 100, 1000}.
evaluator = EvaluateRetrieval()
# evaluate() returns metric dicts keyed like "NDCG@10" / "MAP@100";
# precision is computed but only the three headline metrics are logged.
ndcg, map_score, recall, precision = evaluator.evaluate(
    qrels, results, evaluator.k_values
)
logging.info(f"NDCG@10: {ndcg['NDCG@10']:.4f}")
logging.info(f"MAP@100: {map_score['MAP@100']:.4f}")
logging.info(f"Recall@100: {recall['Recall@100']:.4f}")
# 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:34<00:00, 2.88it/s]
# 2026-02-22 16:05:51 - Average time taken: 347.32ms
# 2026-02-22 16:05:51 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
# 2026-02-22 16:05:51 -
# 2026-02-22 16:05:51 - NDCG@1: 0.5000
# 2026-02-22 16:05:51 - NDCG@3: 0.5508
# 2026-02-22 16:05:51 - NDCG@5: 0.5712
# 2026-02-22 16:05:51 - NDCG@10: 0.5877
# 2026-02-22 16:05:51 - NDCG@100: 0.6257
# 2026-02-22 16:05:51 - NDCG@1000: 0.6257
# 2026-02-22 16:05:51 -
# 2026-02-22 16:05:51 - MAP@1: 0.3982
# 2026-02-22 16:05:51 - MAP@3: 0.4972
# 2026-02-22 16:05:51 - MAP@5: 0.5161
# 2026-02-22 16:05:51 - MAP@10: 0.5302
# 2026-02-22 16:05:51 - MAP@100: 0.5415
# 2026-02-22 16:05:51 - MAP@1000: 0.5415
# 2026-02-22 16:05:51 -
# 2026-02-22 16:05:51 - Recall@1: 0.3982
# 2026-02-22 16:05:51 - Recall@3: 0.5725
# 2026-02-22 16:05:51 - Recall@5: 0.6299
# 2026-02-22 16:05:51 - Recall@10: 0.6869
# 2026-02-22 16:05:51 - Recall@100: 0.8259
# 2026-02-22 16:05:51 - Recall@1000: 0.8259
# 2026-02-22 16:05:51 -
# 2026-02-22 16:05:51 - P@1: 0.5000
# 2026-02-22 16:05:51 - P@3: 0.2600
# 2026-02-22 16:05:51 - P@5: 0.1840
# 2026-02-22 16:05:51 - P@10: 0.1060
# 2026-02-22 16:05:51 - P@100: 0.0153
# 2026-02-22 16:05:51 - P@1000: 0.0015
# 2026-02-22 16:05:51 - NDCG@10: 0.5877
# 2026-02-22 16:05:51 - MAP@100: 0.5415
# 2026-02-22 16:05:51 - Recall@100: 0.8259
# /// script
# requires-python = ">=3.14"
# dependencies = [
# "beir>=2.2.0",
# ]
# ///
import datetime
import difflib
import logging
import os
import pathlib
import string
from beir import LoggingHandler, util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from tqdm import tqdm
# English stop-word list (matches scikit-learn's classic ENGLISH_STOP_WORDS),
# frozen for cheap O(1) membership tests during text normalization.
STOP_WORDS = frozenset(
    """
    a about above across after afterwards again against all almost alone along
    already also although always am among amongst amoungst amount an and another
    any anyhow anyone anything anyway anywhere are around as at back be became
    because become becomes becoming been before beforehand behind being below
    beside besides between beyond bill both bottom but by call can cannot cant
    co con could couldnt cry de describe detail do done down due during each eg
    eight either eleven else elsewhere empty enough etc even ever every everyone
    everything everywhere except few fifteen fifty fill find fire first five for
    former formerly forty found four from front full further get give go had has
    hasnt have he hence her here hereafter hereby herein hereupon hers herself
    him himself his how however hundred i ie if in inc indeed interest into is
    it its itself keep last latter latterly least less ltd made many may me
    meanwhile might mill mine more moreover most mostly move much must my myself
    name namely neither never nevertheless next nine no nobody none noone nor
    not nothing now nowhere of off often on once one only onto or other others
    otherwise our ours ourselves out over own part per perhaps please put rather
    re same see seem seemed seeming seems serious several she should show side
    since sincere six sixty so some somehow someone something sometime sometimes
    somewhere still such system take ten than that the their them themselves
    then thence there thereafter thereby therefore therein thereupon these they
    thick thin third this those though three through throughout thru thus to
    together too top toward towards twelve twenty two un under until up upon us
    very via was we well were what whatever when whence whenever where
    whereafter whereas whereby wherein whereupon wherever whether which while
    whither who whoever whole whom whose why will with within without would yet
    you your yours yourself yourselves
    """.split()
)
# Punctuation-stripping table built once at import time; the original rebuilt
# it with str.maketrans on every call, which is pure overhead when this
# function is applied to every document in the corpus.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_stopwords(text):
    """Normalize *text* for fuzzy matching.

    Lowercases, removes ASCII punctuation, and drops English stop words,
    returning the surviving words joined by single spaces.
    """
    words = text.lower().translate(_PUNCT_TABLE).split()
    return " ".join(w for w in words if w not in STOP_WORDS)
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
#### /print debug information to stdout
# Download and unpack the BEIR Quora duplicate-questions benchmark into a
# "datasets" directory next to this script.
dataset = "quora"
url = (
    f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
)
out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
#### Load the Quora test split: corpus, queries, and relevance judgements (qrels)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
corpus_ids, query_ids = list(corpus), list(queries)
#### Pre-normalize every document once: lowercase, strip punctuation, drop stop words
corpus_texts = [remove_stopwords(corpus[corpus_id]["text"]) for corpus_id in corpus_ids]
#### Retrieve results for all queries
# Map each normalized document text back to ALL corpus IDs that share it.
# The original used corpus_texts.index(match) per match, which is O(corpus)
# per lookup and, worse, always returns the FIRST document with that text —
# after stop-word stripping, distinct documents can normalize to identical
# strings, so later duplicates could never be retrieved and repeated matches
# silently overwrote the same results entry.
text_to_ids = {}
for idx, text in enumerate(corpus_texts):
    text_to_ids.setdefault(text, []).append(corpus_ids[idx])

results = {}
time_taken_all = {}
dataset_size = 100
for query_id in tqdm(query_ids[:dataset_size]):
    query = remove_stopwords(queries[query_id])
    #### Measure time to retrieve top-100 using difflib
    start = datetime.datetime.now()
    # cutoff=0 disables difflib's similarity threshold so we always get n docs.
    matches = difflib.get_close_matches(query, corpus_texts, n=100, cutoff=0)
    end = datetime.datetime.now()
    #### Map matched texts back to corpus IDs with similarity scores
    results[query_id] = {}
    for match in matches:
        score = difflib.SequenceMatcher(None, query, match).ratio()
        # Credit every document that shares this normalized text.
        for corpus_id in text_to_ids[match]:
            results[query_id][corpus_id] = score
    #### Measuring time taken in ms (milliseconds)
    time_taken = (end - start).total_seconds() * 1000
    time_taken_all[query_id] = time_taken
# Average wall-clock retrieval latency over the benchmarked queries.
time_taken = list(time_taken_all.values())
logging.info(f"Average time taken: {sum(time_taken) / len(time_taken_all):.2f}ms")
#### Evaluate retrieval performance
# EvaluateRetrieval() without a retriever is used purely for metric
# computation; judging by the logged output, evaluator.k_values covers
# k in {1, 3, 5, 10, 100, 1000}.
evaluator = EvaluateRetrieval()
# precision is computed but only the three headline metrics are logged.
ndcg, map_score, recall, precision = evaluator.evaluate(
    qrels, results, evaluator.k_values
)
logging.info(f"NDCG@10: {ndcg['NDCG@10']:.4f}")
logging.info(f"MAP@100: {map_score['MAP@100']:.4f}")
logging.info(f"Recall@100: {recall['Recall@100']:.4f}")
# 100%|████████████████████████████████████████████████████████████████| 100/100 [25:26<00:00, 15.27s/it]
# 2026-02-22 16:03:56 - Average time taken: 15120.29ms
# 2026-02-22 16:03:56 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
# 2026-02-22 16:03:56 -
# 2026-02-22 16:03:56 - NDCG@1: 0.5600
# 2026-02-22 16:03:56 - NDCG@3: 0.5594
# 2026-02-22 16:03:56 - NDCG@5: 0.5642
# 2026-02-22 16:03:56 - NDCG@10: 0.5709
# 2026-02-22 16:03:56 - NDCG@100: 0.6095
# 2026-02-22 16:03:56 - NDCG@1000: 0.6095
# 2026-02-22 16:03:56 -
# 2026-02-22 16:03:56 - MAP@1: 0.4392
# 2026-02-22 16:03:56 - MAP@3: 0.4951
# 2026-02-22 16:03:56 - MAP@5: 0.5064
# 2026-02-22 16:03:56 - MAP@10: 0.5136
# 2026-02-22 16:03:56 - MAP@100: 0.5246
# 2026-02-22 16:03:56 - MAP@1000: 0.5246
# 2026-02-22 16:03:56 -
# 2026-02-22 16:03:56 - Recall@1: 0.4392
# 2026-02-22 16:03:56 - Recall@3: 0.5543
# 2026-02-22 16:03:56 - Recall@5: 0.5986
# 2026-02-22 16:03:56 - Recall@10: 0.6342
# 2026-02-22 16:03:56 - Recall@100: 0.7652
# 2026-02-22 16:03:56 - Recall@1000: 0.7652
# 2026-02-22 16:03:56 -
# 2026-02-22 16:03:56 - P@1: 0.5600
# 2026-02-22 16:03:56 - P@3: 0.2533
# 2026-02-22 16:03:56 - P@5: 0.1680
# 2026-02-22 16:03:56 - P@10: 0.0940
# 2026-02-22 16:03:56 - P@100: 0.0147
# 2026-02-22 16:03:56 - P@1000: 0.0015
# 2026-02-22 16:03:56 - NDCG@10: 0.5709
# 2026-02-22 16:03:56 - MAP@100: 0.5246
# 2026-02-22 16:03:56 - Recall@100: 0.7652
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment