Skip to content

Instantly share code, notes, and snippets.

@Dranaxel
Created May 25, 2025 10:31
Show Gist options
  • Select an option

  • Save Dranaxel/80e5906b6491de40d1390f71127ed08f to your computer and use it in GitHub Desktop.

Select an option

Save Dranaxel/80e5906b6491de40d1390f71127ed08f to your computer and use it in GitHub Desktop.
SWE-Bench analysis
import pandas as pd
import requests
import time
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from urllib.parse import urlparse
# === Configuration ===
# Paths to the Parquet datasets (Hugging Face hub URIs resolved by pandas).
DATASET_PATHS = [
"hf://datasets/princeton-nlp/SWE-bench/data/dev-00000-of-00001.parquet",
"hf://datasets/princeton-nlp/SWE-bench_Verified/data/test-00000-of-00001.parquet"
]
# GitHub token, read from the environment.  NOTE(review): the literal
# fallback below is a placeholder, not a valid token — with it every GitHub
# API call will fail authentication; set GITHUB_TOKEN before running.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "ghp_wXXXXX")
# Headers sent with every GitHub API request (token auth + v3 JSON media type).
HEADERS = {
"Authorization": f"token {GITHUB_TOKEN}",
"Accept": "application/vnd.github.v3+json"
}
# === Fonctions ===
def get_primary_language(repo_full_name):
    """Return the dominant programming language of a GitHub repository.

    Queries the GitHub "languages" endpoint, which maps each language to
    its byte count in the repo, and returns the language with the most
    bytes.

    Args:
        repo_full_name: "owner/name" repository identifier.

    Returns:
        The language name as a string, or None when the repo reports no
        languages, the API answers with a non-200 status, or the request
        fails at the network level.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/languages"
    try:
        # Bound the wait so a stalled connection cannot hang the whole run.
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        print(f"⚠️ Erreur réseau pour {repo_full_name}: {exc}")
        return None
    if response.status_code == 200:
        languages = response.json()
        if not languages:
            return None
        # Pick the language with the largest byte count.
        return max(languages.items(), key=lambda x: x[1])[0]
    print(f"⚠️ Erreur {response.status_code} pour {repo_full_name}")
    return None
def extract_filename(path):
    """Return a clean file stem from a local path or Hugging Face URI.

    Takes the basename of the URL's path component and strips a trailing
    ".parquet" extension.  Unlike the original str.replace-based version,
    this only removes ".parquet" at the END of the name, so a file such as
    "a.parquet.backup" is left intact.

    Args:
        path: local filesystem path or URI (e.g. "hf://datasets/.../x.parquet").

    Returns:
        The basename without its ".parquet" extension.
    """
    name = os.path.basename(urlparse(path).path)
    if name.endswith(".parquet"):
        name = name[: -len(".parquet")]
    return name
def process_dataset(dataset_path):
    """Enrich one SWE-bench Parquet dataset with GitHub language info.

    For every unique repository in the dataset, queries the GitHub API for
    its primary language, writes an enriched CSV in the working directory,
    and saves a pie chart of the language distribution (when at least one
    language was detected).

    Args:
        dataset_path: local path or hf:// URI of a Parquet file containing
            a "repo" column of "owner/name" strings.

    Side effects:
        Writes "<base>_with_languages.csv" and, when applicable,
        "<base>_language_distribution.png"; prints progress messages.
    """
    print(f"\n📂 Traitement du dataset : {dataset_path}")
    filename_base = extract_filename(dataset_path)

    # Load the Parquet file (pandas resolves hf:// URIs through fsspec).
    df = pd.read_parquet(dataset_path)

    # Unique repositories — each triggers exactly one API call, so no
    # dedup bookkeeping is needed inside the loop.
    repos = df["repo"].dropna().unique()

    repo_language_map = {}
    for repo in tqdm(repos, desc=f"🔍 Repos dans {filename_base}"):
        repo_language_map[repo] = get_primary_language(repo)
        time.sleep(1)  # stay under the GitHub API rate limit

    # Attach the detected language to every row of the dataset.
    df["primary_language"] = df["repo"].map(repo_language_map)

    # Persist the enriched dataset.
    csv_path = f"{filename_base}_with_languages.csv"
    df.to_csv(csv_path, index=False)
    print(f"✅ Données enrichies sauvegardées : {csv_path}")

    # Pie chart of the language distribution; skipped when nothing was found.
    lang_counts = df["primary_language"].dropna().value_counts()
    if not lang_counts.empty:
        plt.figure(figsize=(10, 8))
        plt.pie(lang_counts, labels=lang_counts.index, autopct="%1.1f%%", startangle=140)
        plt.axis("equal")
        plt.title(f"Répartition des langages - {filename_base}")
        plt.tight_layout()
        chart_path = f"{filename_base}_language_distribution.png"
        plt.savefig(chart_path)
        plt.close()
        print(f"📊 Camembert sauvegardé : {chart_path}")
    else:
        print(f"⚠️ Aucun langage détecté pour {filename_base}, pas de graphique généré.")
# === Traitement de tous les datasets ===
# Entry-point guard: process every configured dataset only when the file is
# executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    for path in DATASET_PATHS:
        process_dataset(path)
    print("\n🎉 Tous les datasets ont été traités.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment