Skip to content

Instantly share code, notes, and snippets.

@mshivam019
Created February 16, 2026 14:53
Show Gist options
  • Select an option

  • Save mshivam019/c111fcb048fe910df44d5e2e3a871e4c to your computer and use it in GitHub Desktop.

Select an option

Save mshivam019/c111fcb048fe910df44d5e2e3a871e4c to your computer and use it in GitHub Desktop.
This script finds similarities between submissions.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Detect near-duplicate answers: vectorize each submission's free-text
# rationale with TF-IDF, then report every pair of submissions whose cosine
# similarity meets the threshold.

# Load CSV (the script reads its 'rationale' and 'user_name' columns)
df = pd.read_csv("submissions.csv")

# Use the 'rationale' column. Missing values become empty strings, and the
# cast to str keeps non-text cells (e.g. a purely numeric answer) from
# breaking tokenization.
rationales = df['rationale'].fillna('').astype(str)

# Convert text to TF-IDF vectors, ignoring common English stop words
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(rationales)

# Compute cosine similarity between all pairs: entry [i, j] compares
# submission i with submission j
similarity_matrix = cosine_similarity(tfidf_matrix)

# Threshold for "almost same" (0.8 means 80% similar)
threshold = 0.8

# Extract the names once so the pair loop does not rebuild a row Series via
# df.iloc for every comparison.
user_names = df['user_name'].tolist()

# Find similar submissions. Only j > i is visited, which skips
# self-comparisons and reports each unordered pair exactly once.
similar_pairs = []
for i, first_name in enumerate(user_names):
    for j in range(i + 1, len(user_names)):
        score = similarity_matrix[i, j]
        if score >= threshold:
            similar_pairs.append((first_name, user_names[j], score))

print(f"Found {len(similar_pairs)} pairs of students with similar answers:")
for name_a, name_b, pair_score in similar_pairs:
    print(f"Candidates: {name_a} and {name_b} - Similarity: {pair_score:.2f}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment