Skip to content

Instantly share code, notes, and snippets.

@NoRaincheck
Created February 22, 2026 05:09
Show Gist options
  • Select an option

  • Save NoRaincheck/ed5c5db72f34b3eb6e8566b22d0c7788 to your computer and use it in GitHub Desktop.

Select an option

Save NoRaincheck/ed5c5db72f34b3eb6e8566b22d0c7788 to your computer and use it in GitHub Desktop.
Microbenchmark of BM25 and difflib. difflib is much, much slower, but probably 'good enough' for small contexts/corpora. Stop-word removal makes a big difference to both speed and retrieval performance.
# /// script
# requires-python = ">=3.14"
# dependencies = [
# "beir>=2.2.0",
# "rank-bm25>=0.2.2",
# ]
# ///
import datetime
import logging
import os
import pathlib
from beir import LoggingHandler, util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from rank_bm25 import BM25Okapi
from tqdm import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
#### /print debug information to stdout
# Download and unpack the BEIR Quora duplicate-questions benchmark into a
# "datasets" directory next to this script.
dataset = "quora"
url = (
    f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
)
out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
#### Load the Quora test split: corpus, queries, and relevance judgements (qrels)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
corpus_ids, query_ids = list(corpus), list(queries)
#### Tokenize corpus — plain whitespace split; no lowercasing or stop-word removal
corpus_texts = [corpus[corpus_id]["text"] for corpus_id in corpus_ids]
tokenized_corpus = [doc.split(" ") for doc in corpus_texts]
#### Index all passages into BM25
bm25 = BM25Okapi(tokenized_corpus)
#### Run BM25 retrieval for the first 100 benchmark queries
results = {}       # query_id -> {corpus_id: score} for its top-100 documents
per_query_ms = {}  # query_id -> retrieval latency in milliseconds
n_queries = 100
for qid in tqdm(query_ids[:n_queries]):
    text = queries[qid]
    # Time only the scoring and ranking work, not the ID mapping below.
    t0 = datetime.datetime.now()
    scores = bm25.get_scores(text.split(" "))
    ranked = scores.argsort()[::-1][:100]
    t1 = datetime.datetime.now()
    # Translate the ranked row indices back into corpus document IDs.
    results[qid] = {corpus_ids[i]: float(scores[i]) for i in ranked}
    per_query_ms[qid] = (t1 - t0).total_seconds() * 1000
latencies = list(per_query_ms.values())
logging.info(f"Average time taken: {sum(latencies) / len(per_query_ms):.2f}ms")
#### Evaluate retrieval performance
# EvaluateRetrieval() without a retriever is used purely for metric
# computation; judging by the logged output, evaluator.k_values covers
# k in {1, 3, 5, 10, 100, 1000}.
evaluator = EvaluateRetrieval()
# evaluate() returns metric dicts keyed like "NDCG@10" / "MAP@100";
# precision is computed but only the three headline metrics are logged.
ndcg, map_score, recall, precision = evaluator.evaluate(
    qrels, results, evaluator.k_values
)
logging.info(f"NDCG@10: {ndcg['NDCG@10']:.4f}")
logging.info(f"MAP@100: {map_score['MAP@100']:.4f}")
logging.info(f"Recall@100: {recall['Recall@100']:.4f}")
# 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:34<00:00, 2.88it/s]
# 2026-02-22 16:05:51 - Average time taken: 347.32ms
# 2026-02-22 16:05:51 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
# 2026-02-22 16:05:51 -
# 2026-02-22 16:05:51 - NDCG@1: 0.5000
# 2026-02-22 16:05:51 - NDCG@3: 0.5508
# 2026-02-22 16:05:51 - NDCG@5: 0.5712
# 2026-02-22 16:05:51 - NDCG@10: 0.5877
# 2026-02-22 16:05:51 - NDCG@100: 0.6257
# 2026-02-22 16:05:51 - NDCG@1000: 0.6257
# 2026-02-22 16:05:51 -
# 2026-02-22 16:05:51 - MAP@1: 0.3982
# 2026-02-22 16:05:51 - MAP@3: 0.4972
# 2026-02-22 16:05:51 - MAP@5: 0.5161
# 2026-02-22 16:05:51 - MAP@10: 0.5302
# 2026-02-22 16:05:51 - MAP@100: 0.5415
# 2026-02-22 16:05:51 - MAP@1000: 0.5415
# 2026-02-22 16:05:51 -
# 2026-02-22 16:05:51 - Recall@1: 0.3982
# 2026-02-22 16:05:51 - Recall@3: 0.5725
# 2026-02-22 16:05:51 - Recall@5: 0.6299
# 2026-02-22 16:05:51 - Recall@10: 0.6869
# 2026-02-22 16:05:51 - Recall@100: 0.8259
# 2026-02-22 16:05:51 - Recall@1000: 0.8259
# 2026-02-22 16:05:51 -
# 2026-02-22 16:05:51 - P@1: 0.5000
# 2026-02-22 16:05:51 - P@3: 0.2600
# 2026-02-22 16:05:51 - P@5: 0.1840
# 2026-02-22 16:05:51 - P@10: 0.1060
# 2026-02-22 16:05:51 - P@100: 0.0153
# 2026-02-22 16:05:51 - P@1000: 0.0015
# 2026-02-22 16:05:51 - NDCG@10: 0.5877
# 2026-02-22 16:05:51 - MAP@100: 0.5415
# 2026-02-22 16:05:51 - Recall@100: 0.8259
# /// script
# requires-python = ">=3.14"
# dependencies = [
# "beir>=2.2.0",
# ]
# ///
import datetime
import difflib
import logging
import os
import pathlib
import string
from beir import LoggingHandler, util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from tqdm import tqdm
# English stop-word list (matches scikit-learn's classic ENGLISH_STOP_WORDS),
# frozen for cheap O(1) membership tests during text normalization.
STOP_WORDS = frozenset(
    """
    a about above across after afterwards again against all almost alone along
    already also although always am among amongst amoungst amount an and another
    any anyhow anyone anything anyway anywhere are around as at back be became
    because become becomes becoming been before beforehand behind being below
    beside besides between beyond bill both bottom but by call can cannot cant
    co con could couldnt cry de describe detail do done down due during each eg
    eight either eleven else elsewhere empty enough etc even ever every everyone
    everything everywhere except few fifteen fifty fill find fire first five for
    former formerly forty found four from front full further get give go had has
    hasnt have he hence her here hereafter hereby herein hereupon hers herself
    him himself his how however hundred i ie if in inc indeed interest into is
    it its itself keep last latter latterly least less ltd made many may me
    meanwhile might mill mine more moreover most mostly move much must my myself
    name namely neither never nevertheless next nine no nobody none noone nor
    not nothing now nowhere of off often on once one only onto or other others
    otherwise our ours ourselves out over own part per perhaps please put rather
    re same see seem seemed seeming seems serious several she should show side
    since sincere six sixty so some somehow someone something sometime sometimes
    somewhere still such system take ten than that the their them themselves
    then thence there thereafter thereby therefore therein thereupon these they
    thick thin third this those though three through throughout thru thus to
    together too top toward towards twelve twenty two un under until up upon us
    very via was we well were what whatever when whence whenever where
    whereafter whereas whereby wherein whereupon wherever whether which while
    whither who whoever whole whom whose why will with within without would yet
    you your yours yourself yourselves
    """.split()
)
# Punctuation-stripping table built once at import time; the original rebuilt
# it with str.maketrans on every call, which is pure overhead when this
# function is applied to every document in the corpus.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_stopwords(text):
    """Normalize *text* for fuzzy matching.

    Lowercases, removes ASCII punctuation, and drops English stop words,
    returning the surviving words joined by single spaces.
    """
    words = text.lower().translate(_PUNCT_TABLE).split()
    return " ".join(w for w in words if w not in STOP_WORDS)
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
#### /print debug information to stdout
# Download and unpack the BEIR Quora duplicate-questions benchmark into a
# "datasets" directory next to this script.
dataset = "quora"
url = (
    f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
)
out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
#### Load the Quora test split: corpus, queries, and relevance judgements (qrels)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
corpus_ids, query_ids = list(corpus), list(queries)
#### Pre-normalize every document once: lowercase, strip punctuation, drop stop words
corpus_texts = [remove_stopwords(corpus[corpus_id]["text"]) for corpus_id in corpus_ids]
#### Retrieve results for all queries
# Map each normalized document text back to ALL corpus IDs that share it.
# The original used corpus_texts.index(match) per match, which is O(corpus)
# per lookup and, worse, always returns the FIRST document with that text —
# after stop-word stripping, distinct documents can normalize to identical
# strings, so later duplicates could never be retrieved and repeated matches
# silently overwrote the same results entry.
text_to_ids = {}
for idx, text in enumerate(corpus_texts):
    text_to_ids.setdefault(text, []).append(corpus_ids[idx])

results = {}
time_taken_all = {}
dataset_size = 100
for query_id in tqdm(query_ids[:dataset_size]):
    query = remove_stopwords(queries[query_id])
    #### Measure time to retrieve top-100 using difflib
    start = datetime.datetime.now()
    # cutoff=0 disables difflib's similarity threshold so we always get n docs.
    matches = difflib.get_close_matches(query, corpus_texts, n=100, cutoff=0)
    end = datetime.datetime.now()
    #### Map matched texts back to corpus IDs with similarity scores
    results[query_id] = {}
    for match in matches:
        score = difflib.SequenceMatcher(None, query, match).ratio()
        # Credit every document that shares this normalized text.
        for corpus_id in text_to_ids[match]:
            results[query_id][corpus_id] = score
    #### Measuring time taken in ms (milliseconds)
    time_taken = (end - start).total_seconds() * 1000
    time_taken_all[query_id] = time_taken
# Average wall-clock retrieval latency over the benchmarked queries.
time_taken = list(time_taken_all.values())
logging.info(f"Average time taken: {sum(time_taken) / len(time_taken_all):.2f}ms")
#### Evaluate retrieval performance
# EvaluateRetrieval() without a retriever is used purely for metric
# computation; judging by the logged output, evaluator.k_values covers
# k in {1, 3, 5, 10, 100, 1000}.
evaluator = EvaluateRetrieval()
# precision is computed but only the three headline metrics are logged.
ndcg, map_score, recall, precision = evaluator.evaluate(
    qrels, results, evaluator.k_values
)
logging.info(f"NDCG@10: {ndcg['NDCG@10']:.4f}")
logging.info(f"MAP@100: {map_score['MAP@100']:.4f}")
logging.info(f"Recall@100: {recall['Recall@100']:.4f}")
# 100%|████████████████████████████████████████████████████████████████| 100/100 [25:26<00:00, 15.27s/it]
# 2026-02-22 16:03:56 - Average time taken: 15120.29ms
# 2026-02-22 16:03:56 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
# 2026-02-22 16:03:56 -
# 2026-02-22 16:03:56 - NDCG@1: 0.5600
# 2026-02-22 16:03:56 - NDCG@3: 0.5594
# 2026-02-22 16:03:56 - NDCG@5: 0.5642
# 2026-02-22 16:03:56 - NDCG@10: 0.5709
# 2026-02-22 16:03:56 - NDCG@100: 0.6095
# 2026-02-22 16:03:56 - NDCG@1000: 0.6095
# 2026-02-22 16:03:56 -
# 2026-02-22 16:03:56 - MAP@1: 0.4392
# 2026-02-22 16:03:56 - MAP@3: 0.4951
# 2026-02-22 16:03:56 - MAP@5: 0.5064
# 2026-02-22 16:03:56 - MAP@10: 0.5136
# 2026-02-22 16:03:56 - MAP@100: 0.5246
# 2026-02-22 16:03:56 - MAP@1000: 0.5246
# 2026-02-22 16:03:56 -
# 2026-02-22 16:03:56 - Recall@1: 0.4392
# 2026-02-22 16:03:56 - Recall@3: 0.5543
# 2026-02-22 16:03:56 - Recall@5: 0.5986
# 2026-02-22 16:03:56 - Recall@10: 0.6342
# 2026-02-22 16:03:56 - Recall@100: 0.7652
# 2026-02-22 16:03:56 - Recall@1000: 0.7652
# 2026-02-22 16:03:56 -
# 2026-02-22 16:03:56 - P@1: 0.5600
# 2026-02-22 16:03:56 - P@3: 0.2533
# 2026-02-22 16:03:56 - P@5: 0.1680
# 2026-02-22 16:03:56 - P@10: 0.0940
# 2026-02-22 16:03:56 - P@100: 0.0147
# 2026-02-22 16:03:56 - P@1000: 0.0015
# 2026-02-22 16:03:56 - NDCG@10: 0.5709
# 2026-02-22 16:03:56 - MAP@100: 0.5246
# 2026-02-22 16:03:56 - Recall@100: 0.7652
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment