Created
May 25, 2025 10:31
-
-
Save Dranaxel/80e5906b6491de40d1390f71127ed08f to your computer and use it in GitHub Desktop.
SWE-Bench analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import requests | |
| import time | |
| import os | |
| import matplotlib.pyplot as plt | |
| from tqdm import tqdm | |
| from urllib.parse import urlparse | |
# === Configuration ===
# Paths to the Parquet datasets to analyse.
DATASET_PATHS = [
    "hf://datasets/princeton-nlp/SWE-bench/data/dev-00000-of-00001.parquet",
    "hf://datasets/princeton-nlp/SWE-bench_Verified/data/test-00000-of-00001.parquet",
]

# GitHub token, read from the environment only. No placeholder fallback:
# a dummy token sent as a credential makes every request fail authentication,
# whereas omitting the header falls back to anonymous access.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# Headers sent with every GitHub API request; the Authorization entry is
# added only when a token is actually configured.
HEADERS = {"Accept": "application/vnd.github.v3+json"}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"token {GITHUB_TOKEN}"
| # === Fonctions === | |
def get_primary_language(repo_full_name):
    """Return the dominant language of a GitHub repository.

    Queries ``https://api.github.com/repos/<repo_full_name>/languages`` and
    picks the language with the largest value in the returned mapping.

    Args:
        repo_full_name: Repository identifier, interpolated as-is into the
            URL (presumably in ``owner/name`` form).

    Returns:
        The name of the language with the highest reported value, or ``None``
        when the repository reports no languages, the API answers with a
        non-200 status, or the request itself fails.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/languages"
    try:
        # Without a timeout, requests may block forever on a stalled connection.
        response = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException as exc:
        # Best-effort lookup: report the failure and let the caller carry on
        # with the remaining repositories.
        print(f"⚠️ Erreur réseau pour {repo_full_name} : {exc}")
        return None

    if response.status_code != 200:
        print(f"⚠️ Erreur {response.status_code} pour {repo_full_name}")
        return None

    languages = response.json()
    if not languages:
        return None
    # Key of the largest value; on ties the first key encountered wins.
    return max(languages, key=languages.get)
def extract_filename(path):
    """Return a clean base name for a dataset path (local path or URL).

    The path component of ``path`` is extracted with ``urlparse``, reduced to
    its final segment, and a trailing ``.parquet`` extension is removed.

    Args:
        path: Dataset location, e.g. ``hf://datasets/org/name/data/x.parquet``
            or a plain filesystem path.

    Returns:
        The final path segment without its trailing ``.parquet`` extension.
        Names that do not end in ``.parquet`` are returned unchanged.
    """
    suffix = ".parquet"
    name = os.path.basename(urlparse(path).path)
    # Strip only the trailing extension; str.replace would also remove
    # ".parquet" occurring in the middle of the name.
    if name.endswith(suffix):
        name = name[: -len(suffix)]
    return name
def _fetch_repo_languages(repos, label):
    """Map each repository in ``repos`` to its primary language (or None)."""
    repo_language_map = {}
    for repo in tqdm(repos, desc=f"🔍 Repos dans {label}"):
        repo_language_map[repo] = get_primary_language(repo)
        time.sleep(1)  # Respect the API rate limits.
    return repo_language_map


def _save_language_pie(lang_counts, filename_base):
    """Draw ``lang_counts`` as a pie chart, save it as a PNG, return its path."""
    plt.figure(figsize=(10, 8))
    plt.pie(lang_counts, labels=lang_counts.index, autopct="%1.1f%%", startangle=140)
    plt.axis("equal")
    plt.title(f"Répartition des langages - {filename_base}")
    plt.tight_layout()
    chart_path = f"{filename_base}_language_distribution.png"
    plt.savefig(chart_path)
    plt.close()
    return chart_path


def process_dataset(dataset_path):
    """Enrich one Parquet dataset with each repository's primary language.

    Steps:
      1. Read the Parquet file at ``dataset_path`` into a DataFrame.
      2. Look up the primary language of every distinct, non-null value of
         the ``repo`` column via ``get_primary_language``.
      3. Add a ``primary_language`` column and write the result to
         ``<base>_with_languages.csv``.
      4. If at least one language was resolved, save a pie chart of the
         per-row language counts to ``<base>_language_distribution.png``.

    ``<base>`` is the value returned by ``extract_filename(dataset_path)``;
    both output files are written relative to the current working directory.

    Args:
        dataset_path: Location of the Parquet file, passed to
            ``pandas.read_parquet``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print(f"\n📂 Traitement du dataset : {dataset_path}")
    filename_base = extract_filename(dataset_path)

    # Load the Parquet file.
    df = pd.read_parquet(dataset_path)

    # unique() already deduplicates, so each repository is queried exactly
    # once and no "already seen" check is needed in the lookup loop.
    repos = df["repo"].dropna().unique()
    repo_language_map = _fetch_repo_languages(repos, filename_base)

    # Attach the language column; repos with a failed lookup map to None.
    df["primary_language"] = df["repo"].map(repo_language_map)

    # Save the enriched CSV.
    csv_path = f"{filename_base}_with_languages.csv"
    df.to_csv(csv_path, index=False)
    print(f"✅ Données enrichies sauvegardées : {csv_path}")

    # Counts are per dataset row, not per repository.
    lang_counts = df["primary_language"].dropna().value_counts()
    if lang_counts.empty:
        print(f"⚠️ Aucun langage détecté pour {filename_base}, pas de graphique généré.")
        return

    chart_path = _save_language_pie(lang_counts, filename_base)
    print(f"📊 Camembert sauvegardé : {chart_path}")
# === Process every configured dataset ===
# Each dataset is handled independently, in the order listed in DATASET_PATHS.
for path in DATASET_PATHS:
    process_dataset(path)

print("\n🎉 Tous les datasets ont été traités.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment