Created
February 22, 2026 05:09
-
-
Save NoRaincheck/ed5c5db72f34b3eb6e8566b22d0c7788 to your computer and use it in GitHub Desktop.
Microbenchmark of BM25 and difflib. difflib is much, much slower, but probably "good enough" for small contexts/corpora. Stop-word removal makes a big difference to both speed and retrieval quality.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.14" | |
| # dependencies = [ | |
| # "beir>=2.2.0", | |
| # "rank-bm25>=0.2.2", | |
| # ] | |
| # /// | |
| import datetime | |
| import logging | |
| import os | |
| import pathlib | |
| from beir import LoggingHandler, util | |
| from beir.datasets.data_loader import GenericDataLoader | |
| from beir.retrieval.evaluation import EvaluateRetrieval | |
| from rank_bm25 import BM25Okapi | |
| from tqdm import tqdm | |
#### Route BEIR's log records (with timestamps) to stdout
logging.basicConfig(
    handlers=[LoggingHandler()],
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

#### Download and unpack the BEIR "quora" dataset into ./datasets next to this file
dataset = "quora"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
#### Load the test split of the quora dataset: corpus, queries, relevance judgements
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
corpus_ids, query_ids = list(corpus), list(queries)

#### Tokenize corpus.
# str.split() (no argument) splits on any run of whitespace and never yields
# empty tokens; the original split(" ") emitted "" for consecutive spaces and
# left newlines/tabs glued to words, polluting the BM25 vocabulary.
corpus_texts = [corpus[corpus_id]["text"] for corpus_id in corpus_ids]
tokenized_corpus = [doc.split() for doc in corpus_texts]

#### Index all passages into BM25
bm25 = BM25Okapi(tokenized_corpus)
#### Retrieve top-100 BM25 results for the first `dataset_size` queries
import time  # perf_counter: monotonic, high-resolution clock suited to benchmarking

results = {}
time_taken_all = {}
dataset_size = 100
for query_id in tqdm(query_ids[:dataset_size]):
    query = queries[query_id]

    #### Measure time to score the corpus and take the top-100 documents.
    # time.perf_counter() is monotonic; datetime.now() (the original) follows
    # the wall clock and can jump under NTP/DST adjustments.
    start = time.perf_counter()
    tokenized_query = query.split()  # split() avoids empty tokens from split(" ")
    scores = bm25.get_scores(tokenized_query)
    top_n_indices = scores.argsort()[-100:][::-1]  # 100 best indices, descending score
    elapsed_ms = (time.perf_counter() - start) * 1000  # milliseconds

    #### Map numpy indices back to corpus IDs for the evaluator
    results[query_id] = {corpus_ids[idx]: float(scores[idx]) for idx in top_n_indices}
    time_taken_all[query_id] = elapsed_ms

all_times = list(time_taken_all.values())
logging.info(f"Average time taken: {sum(all_times) / len(all_times):.2f}ms")

#### Evaluate retrieval performance against the relevance judgements
evaluator = EvaluateRetrieval()
ndcg, map_score, recall, precision = evaluator.evaluate(
    qrels, results, evaluator.k_values
)
logging.info(f"NDCG@10: {ndcg['NDCG@10']:.4f}")
logging.info(f"MAP@100: {map_score['MAP@100']:.4f}")
logging.info(f"Recall@100: {recall['Recall@100']:.4f}")
| # 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:34<00:00, 2.88it/s] | |
| # 2026-02-22 16:05:51 - Average time taken: 347.32ms | |
| # 2026-02-22 16:05:51 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this. | |
| # 2026-02-22 16:05:51 - | |
| # 2026-02-22 16:05:51 - NDCG@1: 0.5000 | |
| # 2026-02-22 16:05:51 - NDCG@3: 0.5508 | |
| # 2026-02-22 16:05:51 - NDCG@5: 0.5712 | |
| # 2026-02-22 16:05:51 - NDCG@10: 0.5877 | |
| # 2026-02-22 16:05:51 - NDCG@100: 0.6257 | |
| # 2026-02-22 16:05:51 - NDCG@1000: 0.6257 | |
| # 2026-02-22 16:05:51 - | |
| # 2026-02-22 16:05:51 - MAP@1: 0.3982 | |
| # 2026-02-22 16:05:51 - MAP@3: 0.4972 | |
| # 2026-02-22 16:05:51 - MAP@5: 0.5161 | |
| # 2026-02-22 16:05:51 - MAP@10: 0.5302 | |
| # 2026-02-22 16:05:51 - MAP@100: 0.5415 | |
| # 2026-02-22 16:05:51 - MAP@1000: 0.5415 | |
| # 2026-02-22 16:05:51 - | |
| # 2026-02-22 16:05:51 - Recall@1: 0.3982 | |
| # 2026-02-22 16:05:51 - Recall@3: 0.5725 | |
| # 2026-02-22 16:05:51 - Recall@5: 0.6299 | |
| # 2026-02-22 16:05:51 - Recall@10: 0.6869 | |
| # 2026-02-22 16:05:51 - Recall@100: 0.8259 | |
| # 2026-02-22 16:05:51 - Recall@1000: 0.8259 | |
| # 2026-02-22 16:05:51 - | |
| # 2026-02-22 16:05:51 - P@1: 0.5000 | |
| # 2026-02-22 16:05:51 - P@3: 0.2600 | |
| # 2026-02-22 16:05:51 - P@5: 0.1840 | |
| # 2026-02-22 16:05:51 - P@10: 0.1060 | |
| # 2026-02-22 16:05:51 - P@100: 0.0153 | |
| # 2026-02-22 16:05:51 - P@1000: 0.0015 | |
| # 2026-02-22 16:05:51 - NDCG@10: 0.5877 | |
| # 2026-02-22 16:05:51 - MAP@100: 0.5415 | |
| # 2026-02-22 16:05:51 - Recall@100: 0.8259 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.14" | |
| # dependencies = [ | |
| # "beir>=2.2.0", | |
| # ] | |
| # /// | |
| import datetime | |
| import difflib | |
| import logging | |
| import os | |
| import pathlib | |
| import string | |
| from beir import LoggingHandler, util | |
| from beir.datasets.data_loader import GenericDataLoader | |
| from beir.retrieval.evaluation import EvaluateRetrieval | |
| from tqdm import tqdm | |
# English stop-word list (318 words; appears to match scikit-learn's
# ENGLISH_STOP_WORDS — verify if exact provenance matters). frozenset gives
# O(1) membership tests in remove_stopwords().
STOP_WORDS = frozenset(
    """
    a about above across after afterwards again against all almost alone along
    already also although always am among amongst amoungst amount an and
    another any anyhow anyone anything anyway anywhere are around as at back
    be became because become becomes becoming been before beforehand behind
    being below beside besides between beyond bill both bottom but by call can
    cannot cant co con could couldnt cry de describe detail do done down due
    during each eg eight either eleven else elsewhere empty enough etc even
    ever every everyone everything everywhere except few fifteen fifty fill
    find fire first five for former formerly forty found four from front full
    further get give go had has hasnt have he hence her here hereafter hereby
    herein hereupon hers herself him himself his how however hundred i ie if
    in inc indeed interest into is it its itself keep last latter latterly
    least less ltd made many may me meanwhile might mill mine more moreover
    most mostly move much must my myself name namely neither never
    nevertheless next nine no nobody none noone nor not nothing now nowhere
    of off often on once one only onto or other others otherwise our ours
    ourselves out over own part per perhaps please put rather re same see
    seem seemed seeming seems serious several she should show side since
    sincere six sixty so some somehow someone something sometime sometimes
    somewhere still such system take ten than that the their them themselves
    then thence there thereafter thereby therefore therein thereupon these
    they thick thin third this those though three through throughout thru
    thus to together too top toward towards twelve twenty two un under until
    up upon us very via was we well were what whatever when whence whenever
    where whereafter whereas whereby wherein whereupon wherever whether which
    while whither who whoever whole whom whose why will with within without
    would yet you your yours yourself yourselves
    """.split()
)

# Translation table deleting all ASCII punctuation. Built ONCE at module load:
# the original rebuilt it with str.maketrans on every remove_stopwords() call,
# which is loop-invariant work repeated for every document and query.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_stopwords(text):
    """Lowercase *text*, delete ASCII punctuation, and drop stop words.

    Returns the surviving words joined by single spaces. Note that deleting
    punctuation merges contractions and hyphenations ("don't" -> "dont",
    "state-of-the-art" -> "stateoftheart") — same behavior as the original.
    """
    words = text.lower().translate(_PUNCT_TABLE).split()
    return " ".join(w for w in words if w not in STOP_WORDS)
#### Send log records (with timestamps) to stdout
logging.basicConfig(
    handlers=[LoggingHandler()],
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

#### Download and unpack the BEIR "quora" dataset into ./datasets next to this file
dataset = "quora"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)

#### Load the test split and pre-clean every corpus passage once up front
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
corpus_ids, query_ids = list(corpus), list(queries)
corpus_texts = [remove_stopwords(corpus[corpus_id]["text"]) for corpus_id in corpus_ids]
#### Retrieve top-100 difflib results for the first `dataset_size` queries
import time  # perf_counter: monotonic clock for benchmarking (datetime.now() is not)

# Map each cleaned text back to its corpus ID once, instead of calling
# corpus_texts.index(match) — an O(corpus) linear scan — for every one of the
# 100 matches of every query. setdefault keeps the FIRST occurrence, which
# reproduces list.index semantics exactly when texts are duplicated.
text_to_corpus_id = {}
for corpus_id, text in zip(corpus_ids, corpus_texts):
    text_to_corpus_id.setdefault(text, corpus_id)

results = {}
time_taken_all = {}
dataset_size = 100
for query_id in tqdm(query_ids[:dataset_size]):
    query = remove_stopwords(queries[query_id])

    #### Measure time to retrieve the 100 closest texts using difflib
    start = time.perf_counter()
    # cutoff=0 keeps even very dissimilar texts, so we always get up to n=100.
    matches = difflib.get_close_matches(query, corpus_texts, n=100, cutoff=0)
    elapsed_ms = (time.perf_counter() - start) * 1000  # milliseconds

    #### Map matched texts back to corpus IDs with similarity scores.
    # NOTE(review): get_close_matches computed each ratio internally but does
    # not return it, so it is recomputed here — outside the timed region, as
    # in the original.
    results[query_id] = {}
    for match in matches:
        score = difflib.SequenceMatcher(None, query, match).ratio()
        results[query_id][text_to_corpus_id[match]] = score

    time_taken_all[query_id] = elapsed_ms

all_times = list(time_taken_all.values())
logging.info(f"Average time taken: {sum(all_times) / len(all_times):.2f}ms")

#### Evaluate retrieval performance against the relevance judgements
evaluator = EvaluateRetrieval()
ndcg, map_score, recall, precision = evaluator.evaluate(
    qrels, results, evaluator.k_values
)
logging.info(f"NDCG@10: {ndcg['NDCG@10']:.4f}")
logging.info(f"MAP@100: {map_score['MAP@100']:.4f}")
logging.info(f"Recall@100: {recall['Recall@100']:.4f}")
| # 100%|████████████████████████████████████████████████████████████████| 100/100 [25:26<00:00, 15.27s/it] | |
| # 2026-02-22 16:03:56 - Average time taken: 15120.29ms | |
| # 2026-02-22 16:03:56 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this. | |
| # 2026-02-22 16:03:56 - | |
| # 2026-02-22 16:03:56 - NDCG@1: 0.5600 | |
| # 2026-02-22 16:03:56 - NDCG@3: 0.5594 | |
| # 2026-02-22 16:03:56 - NDCG@5: 0.5642 | |
| # 2026-02-22 16:03:56 - NDCG@10: 0.5709 | |
| # 2026-02-22 16:03:56 - NDCG@100: 0.6095 | |
| # 2026-02-22 16:03:56 - NDCG@1000: 0.6095 | |
| # 2026-02-22 16:03:56 - | |
| # 2026-02-22 16:03:56 - MAP@1: 0.4392 | |
| # 2026-02-22 16:03:56 - MAP@3: 0.4951 | |
| # 2026-02-22 16:03:56 - MAP@5: 0.5064 | |
| # 2026-02-22 16:03:56 - MAP@10: 0.5136 | |
| # 2026-02-22 16:03:56 - MAP@100: 0.5246 | |
| # 2026-02-22 16:03:56 - MAP@1000: 0.5246 | |
| # 2026-02-22 16:03:56 - | |
| # 2026-02-22 16:03:56 - Recall@1: 0.4392 | |
| # 2026-02-22 16:03:56 - Recall@3: 0.5543 | |
| # 2026-02-22 16:03:56 - Recall@5: 0.5986 | |
| # 2026-02-22 16:03:56 - Recall@10: 0.6342 | |
| # 2026-02-22 16:03:56 - Recall@100: 0.7652 | |
| # 2026-02-22 16:03:56 - Recall@1000: 0.7652 | |
| # 2026-02-22 16:03:56 - | |
| # 2026-02-22 16:03:56 - P@1: 0.5600 | |
| # 2026-02-22 16:03:56 - P@3: 0.2533 | |
| # 2026-02-22 16:03:56 - P@5: 0.1680 | |
| # 2026-02-22 16:03:56 - P@10: 0.0940 | |
| # 2026-02-22 16:03:56 - P@100: 0.0147 | |
| # 2026-02-22 16:03:56 - P@1000: 0.0015 | |
| # 2026-02-22 16:03:56 - NDCG@10: 0.5709 | |
| # 2026-02-22 16:03:56 - MAP@100: 0.5246 | |
| # 2026-02-22 16:03:56 - Recall@100: 0.7652 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment