N-gram matching: scan a folder of Parquet files for exact 13-gram overlaps with the HellaSwag validation split, e.g. to check a training corpus for benchmark contamination. Requires the datasets, tqdm, and pandas packages, and expects Parquet files with a "text" column under base_data/.
import os
from collections import defaultdict

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

N_GRAM_SIZE = 13
PARQUET_FOLDER = "base_data"
TARGET_TEXT_COL = "text"
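# Note: the 13-gram window above mirrors the overlap size commonly used in
# GPT-3-style decontamination checks.
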
def get_ngrams(text, n):
    """Return all n-grams over lowercased, whitespace-split tokens."""
    if not text:
        return []
    tokens = text.lower().split()
    if len(tokens) < n:
        return []
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
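# For reference, get_ngrams slides a one-token window; hypothetical example:
#   get_ngrams("The quick brown fox jumps", 3)
#   -> [('the', 'quick', 'brown'), ('quick', 'brown', 'fox'),
#       ('brown', 'fox', 'jumps')]
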
def format_hellaswag(row):
    """Reconstruct the full text: context + correct ending.

    Rows without a label (e.g. the test split) fall back to the context alone.
    """
    if 'label' not in row or row['label'] == '':
        return row['ctx']
    correct_index = int(row['label'])
    return f"{row['ctx']} {row['endings'][correct_index]}"
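# Example with a hypothetical row:
#   {'ctx': 'A man sits down at a piano.', 'label': '1',
#    'endings': ['...', 'He begins to play a song.', '...', '...']}
#   -> "A man sits down at a piano. He begins to play a song."
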
def find_and_show_matches():
    # 1. Load datasets
    print("Loading Hellaswag (validation)...")
    ds_source = load_dataset("hellaswag", split="validation")

    parquet_files = [os.path.join(PARQUET_FOLDER, x)
                     for x in os.listdir(PARQUET_FOLDER)
                     if x.endswith(".parquet")]
    print(f"Loading {parquet_files}...")
    ds_target = load_dataset("parquet", data_files=parquet_files, split="train")
    # 2. Index every source n-gram
    # Map: { ngram_tuple : set( (hellaswag_ind, full_text) ) }
    source_map = defaultdict(set)
    print(f"Indexing Hellaswag {N_GRAM_SIZE}-grams...")
    for row in tqdm(ds_source):
        full_text = format_hellaswag(row)
        ngrams = get_ngrams(full_text, N_GRAM_SIZE)
        for ng in ngrams:
            source_map[ng].add((row['ind'], full_text))
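    # At this point source_map maps each 13-gram to the HellaSwag rows that
    # contain it, e.g. (hypothetical entry):
    #   ('a', 'man', 'sits', ...): {(42, 'A man sits down at a piano. ...')}
    # Storing the full text per n-gram is memory-hungry for large sources;
    # hashing each tuple would shrink the index if that becomes a problem.
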
    # 3. Scan the target corpus for exact n-gram hits
    matches = []
    print("Scanning Parquet files for exact matches...")
    for row in tqdm(ds_target):
        target_text = row.get(TARGET_TEXT_COL, "")
        ngrams = get_ngrams(target_text, N_GRAM_SIZE)
        for ng in ngrams:
            if ng in source_map:
                # Record every Hellaswag row that shares this n-gram
                for (ind, src_sent) in source_map[ng]:
                    matches.append({
                        "ngram": " ".join(ng),
                        "hellaswag_ind": ind,
                        "hellaswag_row": src_sent,
                        "parquet_row": target_text,
                    })
    # Keep only one reported match per unique parquet row; a contaminated row
    # typically shares many consecutive n-grams with its source.
    unique_matches = {}
    for m in matches:
        unique_matches[m['parquet_row']] = m
    matches = list(unique_matches.values())
    # 4. Display results
    print("\n" + "=" * 50)
    print(f"FOUND {len(matches)}/{len(ds_target)} MATCHES")
    print("=" * 50)
    for i, m in enumerate(matches):
        print(f"\n--- MATCH #{i + 1} ---")
        print(f"SHARED N-GRAM : '{m['ngram']}'")
        print(f"HELLASWAG ROW {m['hellaswag_ind']}: ...{m['hellaswag_row'][:100]}...")
        print(f"PARQUET ROW : ...{m['parquet_row'][:100]}...")

    if matches:
        df = pd.DataFrame(matches)
        output_file = "overlap_matches.csv"
        df.to_csv(output_file, index=False)
        print(f"\nAll matches saved to '{output_file}'")

    print("\n" + "=" * 50)
    print(f"Match indices: {[m['hellaswag_ind'] for m in matches]}")


if __name__ == "__main__":
    find_and_show_matches()
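
To sanity-check the script, one can plant a known HellaSwag context in the scan folder and re-run it. A minimal sketch, assuming pandas with a Parquet engine (e.g. pyarrow) is installed; the file name smoke_test.parquet is hypothetical:

import os

import pandas as pd
from datasets import load_dataset

os.makedirs("base_data", exist_ok=True)
row = load_dataset("hellaswag", split="validation")[0]
# HellaSwag contexts are usually well over 13 tokens, so this planted row
# should be reported as a match against the row's 'ind'.
pd.DataFrame({
    "text": ["Unrelated filler text. " + row["ctx"],
             "A clean row with no overlap."],
}).to_parquet("base_data/smoke_test.parquet")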