Skip to content

Instantly share code, notes, and snippets.

@Randl
Created January 8, 2026 14:20
Show Gist options
  • Select an option

  • Save Randl/a4d17dc0ca1c5f064fb9e2e2e9af31a3 to your computer and use it in GitHub Desktop.

Select an option

Save Randl/a4d17dc0ca1c5f064fb9e2e2e9af31a3 to your computer and use it in GitHub Desktop.
N-gram matching
import os
from collections import defaultdict
from datasets import load_dataset
from tqdm import tqdm
# Number of whitespace-separated tokens per n-gram; passed as `n` to get_ngrams().
N_GRAM_SIZE = 13
# Directory that is listed for "*.parquet" files to scan.
PARQUET_FOLDER = "base_data"
# Key read from each parquet row to obtain the text that is scanned.
TARGET_TEXT_COL = "text"
def get_ngrams(text, n):
    """Return every contiguous n-gram of lower-cased, whitespace-split tokens.

    Args:
        text: The string to tokenize. ``None`` or an empty string yields no
            n-grams.
        n: Number of tokens per n-gram. Values below 1 yield no n-grams.

    Returns:
        A list of ``n``-tuples of tokens in order of appearance, or an empty
        list when the text has fewer than ``n`` tokens.
    """
    # A non-positive window size has no meaningful n-grams; without this
    # guard the slicing below would produce empty or odd-length tuples.
    if not text or n < 1:
        return []
    tokens = text.lower().split()
    window_count = len(tokens) - n + 1
    if window_count <= 0:
        return []
    return [tuple(tokens[start:start + n]) for start in range(window_count)]
def format_hellaswag(row):
    """Reconstruct the full text of a row: context plus its correct ending.

    Args:
        row: A mapping with a ``'ctx'`` string, an ``'endings'`` sequence and
            optionally a ``'label'`` holding the index of the correct ending
            (as an int or a numeric string).

    Returns:
        ``"<ctx> <correct ending>"`` when a label is present, otherwise just
        the context.
    """
    label = row['label'] if 'label' in row else None
    # A missing, None or empty label means the row is unlabeled; compare
    # explicitly so that a legitimate integer label 0 is not treated as missing.
    if label is None or label == '':
        return row['ctx']
    correct_index = int(label)
    return f"{row['ctx']} {row['endings'][correct_index]}"
def _index_source_ngrams(ds_source):
    """Map each Hellaswag n-gram to the (ind, full_text) pairs containing it."""
    # Map: { ngram_tuple : { (hellaswag_ind, hellaswag_text): None, ... } }
    # A dict is used as an insertion-ordered set so that iteration order (and
    # therefore which match is reported) is the same on every run.
    source_map = defaultdict(dict)
    for row in tqdm(ds_source):
        full_text = format_hellaswag(row)
        for ng in get_ngrams(full_text, N_GRAM_SIZE):
            source_map[ng][(row['ind'], full_text)] = None
    return source_map


def _collect_unique_matches(ds_target, source_map):
    """Return one match record per distinct target text sharing an n-gram."""
    # Keyed by the parquet text so only one record per unique row is kept.
    # Later hits overwrite earlier ones, so the last shared n-gram (and the
    # last source for it) is the one reported for each row.
    unique_matches = {}
    for row in tqdm(ds_target):
        target_text = row.get(TARGET_TEXT_COL, "")
        for ng in get_ngrams(target_text, N_GRAM_SIZE):
            sources = source_map.get(ng)
            if not sources:
                continue
            shared = " ".join(ng)
            for ind, src_sent in sources:
                unique_matches[target_text] = {
                    "ngram": shared,
                    "hellaswag_ind": ind,
                    "hellaswag_row": src_sent,
                    "parquet_row": target_text,
                }
    return list(unique_matches.values())


def find_and_show_matches():
    """Report parquet rows that share an N_GRAM_SIZE-gram with Hellaswag.

    Loads the Hellaswag validation split and every ``.parquet`` file in
    PARQUET_FOLDER, finds target rows that share at least one n-gram with a
    Hellaswag example, prints a summary of the unique matching rows, and
    writes them to ``overlap_matches.csv`` when any are found.

    Raises:
        FileNotFoundError: If PARQUET_FOLDER contains no ``.parquet`` files.
    """
    # Sorted so the files are loaded in the same order on every run;
    # os.listdir returns entries in arbitrary order.
    parquet_files = sorted(
        os.path.join(PARQUET_FOLDER, name)
        for name in os.listdir(PARQUET_FOLDER)
        if name.endswith(".parquet")
    )
    # Fail early with a clear message instead of after loading Hellaswag.
    if not parquet_files:
        raise FileNotFoundError(f"No .parquet files found in '{PARQUET_FOLDER}'")

    # 1. Load datasets
    print("Loading Hellaswag (Validation)...")
    ds_source = load_dataset("hellaswag", split="validation")
    print(f"Loading {parquet_files}...")
    ds_target = load_dataset("parquet", data_files=parquet_files, split="train")

    # 2. Index every Hellaswag n-gram
    print(f"Indexing Hellaswag {N_GRAM_SIZE}-grams...")
    source_map = _index_source_ngrams(ds_source)

    # 3. Scan the target rows, keeping one match per unique parquet row
    print("Scanning Parquet file for exact matches...")
    matches = _collect_unique_matches(ds_target, source_map)

    # 4. Display results
    print("\n" + "=" * 50)
    print(f"FOUND {len(matches)}/{len(ds_target)} MATCHES")
    print("=" * 50)
    for i, m in enumerate(matches):
        print(f"\n--- MATCH #{i + 1} ---")
        print(f"SHARED N-GRAM : '{m['ngram']}'")
        print(f"HELLASWAG ROW {m['hellaswag_ind']}: ...{m['hellaswag_row'][:100]}...")
        print(f"PARQUET ROW : ...{m['parquet_row'][:100]}...")

    # 5. Save results
    if matches:
        # Imported lazily: pandas is only needed when there is something to save.
        import pandas as pd
        df = pd.DataFrame(matches)
        output_file = "overlap_matches.csv"
        df.to_csv(output_file, index=False)
        print(f"\nAll matches saved to '{output_file}'")
    print("\n" + "=" * 50)
    print(f"Match indices: {[m['hellaswag_ind'] for m in matches]}")
# Run the overlap scan only when executed as a script, not when imported.
if __name__ == "__main__":
    find_and_show_matches()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment