N-gram matching: scan a folder of Parquet files for exact 13-gram overlaps with the HellaSwag validation split, e.g. to check a training corpus for benchmark contamination. Requires the datasets, tqdm, and pandas packages, and expects Parquet files with a "text" column under base_data/.
import os
from collections import defaultdict

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

N_GRAM_SIZE = 13
PARQUET_FOLDER = "base_data"
TARGET_TEXT_COL = "text"
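# Note: the 13-gram window above mirrors the overlap size commonly used in
# GPT-3-style decontamination checks.
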
def get_ngrams(text, n):
    """Return all n-grams over lowercased, whitespace-split tokens."""
    if not text:
        return []
    tokens = text.lower().split()
    if len(tokens) < n:
        return []
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
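# For reference, get_ngrams slides a one-token window; hypothetical example:
#   get_ngrams("The quick brown fox jumps", 3)
#   -> [('the', 'quick', 'brown'), ('quick', 'brown', 'fox'),
#       ('brown', 'fox', 'jumps')]
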
def format_hellaswag(row):
    """Reconstruct the full text: context + correct ending.

    Rows without a label (e.g. the test split) fall back to the context alone.
    """
    if 'label' not in row or row['label'] == '':
        return row['ctx']
    correct_index = int(row['label'])
    return f"{row['ctx']} {row['endings'][correct_index]}"
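# Example with a hypothetical row:
#   {'ctx': 'A man sits down at a piano.', 'label': '1',
#    'endings': ['...', 'He begins to play a song.', '...', '...']}
#   -> "A man sits down at a piano. He begins to play a song."
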
def find_and_show_matches():
    # 1. Load datasets
    print("Loading Hellaswag (validation)...")
    ds_source = load_dataset("hellaswag", split="validation")

    parquet_files = [os.path.join(PARQUET_FOLDER, x)
                     for x in os.listdir(PARQUET_FOLDER)
                     if x.endswith(".parquet")]
    print(f"Loading {parquet_files}...")
    ds_target = load_dataset("parquet", data_files=parquet_files, split="train")
    # 2. Index every source n-gram
    # Map: { ngram_tuple : set( (hellaswag_ind, full_text) ) }
    source_map = defaultdict(set)
    print(f"Indexing Hellaswag {N_GRAM_SIZE}-grams...")
    for row in tqdm(ds_source):
        full_text = format_hellaswag(row)
        ngrams = get_ngrams(full_text, N_GRAM_SIZE)
        for ng in ngrams:
            source_map[ng].add((row['ind'], full_text))
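    # At this point source_map maps each 13-gram to the HellaSwag rows that
    # contain it, e.g. (hypothetical entry):
    #   ('a', 'man', 'sits', ...): {(42, 'A man sits down at a piano. ...')}
    # Storing the full text per n-gram is memory-hungry for large sources;
    # hashing each tuple would shrink the index if that becomes a problem.
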
    # 3. Scan the target corpus for exact n-gram hits
    matches = []
    print("Scanning Parquet files for exact matches...")
    for row in tqdm(ds_target):
        target_text = row.get(TARGET_TEXT_COL, "")
        ngrams = get_ngrams(target_text, N_GRAM_SIZE)
        for ng in ngrams:
            if ng in source_map:
                # Record every Hellaswag row that shares this n-gram
                for (ind, src_sent) in source_map[ng]:
                    matches.append({
                        "ngram": " ".join(ng),
                        "hellaswag_ind": ind,
                        "hellaswag_row": src_sent,
                        "parquet_row": target_text,
                    })
    # Keep only one reported match per unique parquet row; a contaminated row
    # typically shares many consecutive n-grams with its source.
    unique_matches = {}
    for m in matches:
        unique_matches[m['parquet_row']] = m
    matches = list(unique_matches.values())
    # 4. Display results
    print("\n" + "=" * 50)
    print(f"FOUND {len(matches)}/{len(ds_target)} MATCHES")
    print("=" * 50)
    for i, m in enumerate(matches):
        print(f"\n--- MATCH #{i + 1} ---")
        print(f"SHARED N-GRAM : '{m['ngram']}'")
        print(f"HELLASWAG ROW {m['hellaswag_ind']}: ...{m['hellaswag_row'][:100]}...")
        print(f"PARQUET ROW : ...{m['parquet_row'][:100]}...")

    if matches:
        df = pd.DataFrame(matches)
        output_file = "overlap_matches.csv"
        df.to_csv(output_file, index=False)
        print(f"\nAll matches saved to '{output_file}'")

    print("\n" + "=" * 50)
    print(f"Match indices: {[m['hellaswag_ind'] for m in matches]}")


if __name__ == "__main__":
    find_and_show_matches()
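
To sanity-check the script, one can plant a known HellaSwag context in the scan folder and re-run it. A minimal sketch, assuming pandas with a Parquet engine (e.g. pyarrow) is installed; the file name smoke_test.parquet is hypothetical:

import os

import pandas as pd
from datasets import load_dataset

os.makedirs("base_data", exist_ok=True)
row = load_dataset("hellaswag", split="validation")[0]
# HellaSwag contexts are usually well over 13 tokens, so this planted row
# should be reported as a match against the row's 'ind'.
pd.DataFrame({
    "text": ["Unrelated filler text. " + row["ctx"],
             "A clean row with no overlap."],
}).to_parquet("base_data/smoke_test.parquet")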