wrfeatureimportanceanalysis.ipynb
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyPWy7wDVmquVo4h4xf7lzWr", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/stranger9977/19cb82abc218c97fc70ca6f573ad3487/wrfeatureimportanceanalysis.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Set up" | |
| ], | |
| "metadata": { | |
| "id": "5dMvaXG3RhZ9" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "gg_SPqYZiGWM", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 1000 | |
| }, | |
| "outputId": "5441c77f-4cc9-45f2-8e9b-f971e0d9ab7f" | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Collecting nfl_data_py\n", | |
| " Downloading nfl_data_py-0.3.3-py3-none-any.whl.metadata (12 kB)\n", | |
| "Collecting numpy<2.0,>=1.0 (from nfl_data_py)\n", | |
| " Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.0/61.0 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hCollecting pandas<2.0,>=1.0 (from nfl_data_py)\n", | |
| " Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n", | |
| "Collecting appdirs>1 (from nfl_data_py)\n", | |
| " Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)\n", | |
| "Collecting fastparquet>0.5 (from nfl_data_py)\n", | |
| " Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)\n", | |
| "Requirement already satisfied: cramjam>=2.3 in /usr/local/lib/python3.11/dist-packages (from fastparquet>0.5->nfl_data_py) (2.10.0)\n", | |
| "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from fastparquet>0.5->nfl_data_py) (2025.3.0)\n", | |
| "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from fastparquet>0.5->nfl_data_py) (25.0)\n", | |
| "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.11/dist-packages (from pandas<2.0,>=1.0->nfl_data_py) (2.9.0.post0)\n", | |
| "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas<2.0,>=1.0->nfl_data_py) (2025.2)\n", | |
| "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.1->pandas<2.0,>=1.0->nfl_data_py) (1.17.0)\n", | |
| "Downloading nfl_data_py-0.3.3-py3-none-any.whl (13 kB)\n", | |
| "Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n", | |
| "Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m23.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.3/18.3 MB\u001b[0m \u001b[31m82.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hDownloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.0/12.0 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25hInstalling collected packages: appdirs, numpy, pandas, fastparquet, nfl_data_py\n", | |
| " Attempting uninstall: numpy\n", | |
| " Found existing installation: numpy 2.0.2\n", | |
| " Uninstalling numpy-2.0.2:\n", | |
| " Successfully uninstalled numpy-2.0.2\n", | |
| " Attempting uninstall: pandas\n", | |
| " Found existing installation: pandas 2.2.2\n", | |
| " Uninstalling pandas-2.2.2:\n", | |
| " Successfully uninstalled pandas-2.2.2\n", | |
| "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", | |
| "google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 1.5.3 which is incompatible.\n", | |
| "arviz 0.22.0 requires pandas>=2.1.0, but you have pandas 1.5.3 which is incompatible.\n", | |
| "opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\n", | |
| "geopandas 1.1.1 requires pandas>=2.0.0, but you have pandas 1.5.3 which is incompatible.\n", | |
| "opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\n", | |
| "opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\n", | |
| "dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.\n", | |
| "plotnine 0.14.5 requires pandas>=2.2.0, but you have pandas 1.5.3 which is incompatible.\n", | |
| "thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.\n", | |
| "xarray 2025.7.1 requires pandas>=2.2, but you have pandas 1.5.3 which is incompatible.\n", | |
| "cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.\n", | |
| "mizani 0.13.5 requires pandas>=2.2.0, but you have pandas 1.5.3 which is incompatible.\u001b[0m\u001b[31m\n", | |
| "\u001b[0mSuccessfully installed appdirs-1.4.4 fastparquet-2024.11.0 nfl_data_py-0.3.3 numpy-1.26.4 pandas-1.5.3\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.colab-display-data+json": { | |
| "pip_warning": { | |
| "packages": [ | |
| "numpy", | |
| "pandas" | |
| ] | |
| }, | |
| "id": "35639952484f4cba8819068799d5a783" | |
| } | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Collecting BorutaShap\n", | |
| " Downloading BorutaShap-1.0.17-py3-none-any.whl.metadata (7.8 kB)\n", | |
| "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (1.6.1)\n", | |
| "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (4.67.1)\n", | |
| "Requirement already satisfied: statsmodels in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (0.14.5)\n", | |
| "Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (3.10.0)\n", | |
| "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (1.5.3)\n", | |
| "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (1.26.4)\n", | |
| "Requirement already satisfied: shap>=0.34.0 in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (0.48.0)\n", | |
| "Requirement already satisfied: seaborn in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (0.13.2)\n", | |
| "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from BorutaShap) (1.16.0)\n", | |
| "Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.11/dist-packages (from shap>=0.34.0->BorutaShap) (25.0)\n", | |
| "Requirement already satisfied: slicer==0.0.8 in /usr/local/lib/python3.11/dist-packages (from shap>=0.34.0->BorutaShap) (0.0.8)\n", | |
| "Requirement already satisfied: numba>=0.54 in /usr/local/lib/python3.11/dist-packages (from shap>=0.34.0->BorutaShap) (0.60.0)\n", | |
| "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.11/dist-packages (from shap>=0.34.0->BorutaShap) (3.1.1)\n", | |
| "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.11/dist-packages (from shap>=0.34.0->BorutaShap) (4.14.1)\n", | |
| "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->BorutaShap) (1.3.2)\n", | |
| "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib->BorutaShap) (0.12.1)\n", | |
| "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib->BorutaShap) (4.59.0)\n", | |
| "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->BorutaShap) (1.4.8)\n", | |
| "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib->BorutaShap) (11.3.0)\n", | |
| "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->BorutaShap) (3.2.3)\n", | |
| "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11/dist-packages (from matplotlib->BorutaShap) (2.9.0.post0)\n", | |
| "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->BorutaShap) (2025.2)\n", | |
| "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn->BorutaShap) (1.5.1)\n", | |
| "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn->BorutaShap) (3.6.0)\n", | |
| "Requirement already satisfied: patsy>=0.5.6 in /usr/local/lib/python3.11/dist-packages (from statsmodels->BorutaShap) (1.0.1)\n", | |
| "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.11/dist-packages (from numba>=0.54->shap>=0.34.0->BorutaShap) (0.43.0)\n", | |
| "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.7->matplotlib->BorutaShap) (1.17.0)\n", | |
| "Downloading BorutaShap-1.0.17-py3-none-any.whl (14 kB)\n", | |
| "\u001b[31mERROR: Operation cancelled by user\u001b[0m\u001b[31m\n", | |
| "\u001b[0m^C\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%pip install nfl_data_py\n", | |
| "%pip install BorutaShap\n", | |
| "%pip install --upgrade \"scipy<1.11\"\n", | |
| "\n", | |
| "# for github\n", | |
| "%pip install --upgrade ipywidgets nbformat jupyter nbextension enable --py widgetsnbextension" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Ingest Data" | |
| ], | |
| "metadata": { | |
| "id": "xUb-Pi6gRpvS" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import nfl_data_py as nfl\n", | |
| "pd.options.display.float_format = '{:.3f}'.format\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "# Conatants\n", | |
| "POSITION = 'WR'\n", | |
| "YEARS = list(range(2014, 2025))\n", | |
| "# Ingest play-by-play data from 2014 to 2023\n", | |
| "pbp = nfl.import_pbp_data(YEARS)\n", | |
| "\n", | |
| "# Filter only pass plays with valid receiver and standard play context\n", | |
| "pbp = pbp[\n", | |
| " (pbp['play_type'] == 'pass') &\n", | |
| " (pbp['down'].notnull()) &\n", | |
| " (pbp['receiver_player_id'].notnull()) &\n", | |
| " (pbp['pass_attempt'] == 1) & # confirm pass attempt\n", | |
| " (pbp['qb_spike'] != 1) &\n", | |
| " (pbp['qb_kneel'] != 1) &\n", | |
| " (pbp['two_point_attempt'] != 1) & # remove 2pt convs\n", | |
| " (pbp['special_teams_play'] != 1) &\n", | |
| " (pbp['play_deleted'] != 1) &\n", | |
| " (pbp['yards_gained'].notnull()) &\n", | |
| " (pbp['air_yards'].notnull()) # exclude screens/behind LoS if desired\n", | |
| "]\n", | |
| "\n", | |
| "# Calculate PPR fantasy points for WRs\n", | |
| "pbp['fantasy_points_ppr'] = (\n", | |
| " 0.1 * pbp['receiving_yards'].fillna(0) +\n", | |
| " 6 * pbp['pass_touchdown'].fillna(0) +\n", | |
| " 1 * pbp['complete_pass'].fillna(0) +\n", | |
| " -2 * pbp['fumble_lost'].fillna(0)\n", | |
| ")\n", | |
| "\n", | |
| "# Optional: create player-season identifier\n", | |
| "pbp['player_season'] = pbp['receiver_player_id'] + \"_\" + pbp['season'].astype(str)\n", | |
| "\n", | |
| "# Preview cleaned structure\n", | |
| "print(pbp[['season', 'week', 'game_id', 'receiver_player_name', 'fantasy_points_ppr']].sample(5))\n", | |
| "\n" | |
| ], | |
| "metadata": { | |
| "id": "vGFguv4ziqnL", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "e6c9c555-33a5-4036-abab-20a9a39529be" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "2014 done.\n", | |
| "2015 done.\n", | |
| "2016 done.\n", | |
| "2017 done.\n", | |
| "2018 done.\n", | |
| "2019 done.\n", | |
| "2020 done.\n", | |
| "2021 done.\n", | |
| "2022 done.\n", | |
| "2023 done.\n", | |
| "2024 done.\n", | |
| "Downcasting floats.\n" | |
| ] | |
| } | |
| ] | |
| }, | |
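| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "For intuition on the weights above: a 25-yard touchdown catch scores 1 (reception) + 0.1 × 25 = 2.5 (yards) + 6 (TD) = 9.5 PPR points, and a lost fumble deducts 2." | |
| ], | |
| "metadata": {} | |
| }, | |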
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "sorted(pbp.columns)" | |
| ], | |
| "metadata": { | |
| "id": "U4QW58rfucti" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Inspect Snap Data\n", | |
| "Snaps will be a proxy for routes run ONLY in the case where its a pass play and its a WR snap. This assumption doesnt hold for TEs and RBs who stay in to block more often on pass plays." | |
| ], | |
| "metadata": { | |
| "id": "HRfKWSI_RvXN" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "snaps = nfl.import_snap_counts(YEARS)\n", | |
| "snaps.tail()" | |
| ], | |
| "metadata": { | |
| "id": "2_AfEZlRz0Gz" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
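| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "A hedged sanity-check sketch of the proxy (not the notebook's method, which uses raw offensive snaps): since `pbp` is already filtered to pass plays, its row count per team-week approximates dropbacks, so snap share × dropbacks gives an alternative routes-run estimate. The `offense_pct`, `player`, and `position` columns are assumed to be present in the `import_snap_counts` output." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Sketch: estimate routes run as snap share x team dropbacks.\n", | |
| "# Assumes 'offense_pct' is the player's share of offensive snaps.\n", | |
| "dropbacks = (\n", | |
| "    pbp.groupby(['posteam', 'season', 'week'])\n", | |
| "       .size().rename('team_dropbacks').reset_index()\n", | |
| "       .rename(columns={'posteam': 'team'})\n", | |
| ")\n", | |
| "est = snaps.merge(dropbacks, on=['team', 'season', 'week'], how='left')\n", | |
| "est['est_routes'] = est['offense_pct'] * est['team_dropbacks']\n", | |
| "est.loc[est['position'] == 'WR', ['player', 'season', 'week', 'offense_snaps', 'est_routes']].head()" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |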
| { | |
| "cell_type": "code", | |
| "source": [ | |
| " id_map = nfl.import_ids()[['gsis_id','pfr_id','sleeper_id','draft_pick','age','height','weight','name','position']].rename(columns={'pfr_id':'pfr_player_id'})\n", | |
| " id_map.head()" | |
| ], | |
| "metadata": { | |
| "id": "uP3u6PTJygF-" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Feature Defintition and Engineering\n", | |
| "Add more features you want to test here" | |
| ], | |
| "metadata": { | |
| "id": "jo4oU0l3R4Kf" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import numpy as np\n", | |
| "import nfl_data_py as nfl\n", | |
| "from sklearn.model_selection import KFold, cross_val_predict\n", | |
| "from xgboost import XGBRegressor\n", | |
| "\n", | |
| "# 1) Play‐level feature engineering\n", | |
| "def engineer_play_level_features(df):\n", | |
| " df = df.copy()\n", | |
| " df['target'] = df['receiver_player_id'].notnull().astype(int)\n", | |
| " df['is_zone'] = (df['defense_man_zone_type']=='ZONE_COVERAGE').astype(int)\n", | |
| " df['is_man'] = (df['defense_man_zone_type']=='MAN_COVERAGE').astype(int)\n", | |
| " # clipping negative yardage at 0\n", | |
| " df['air_yards'] = df['air_yards'].clip(lower=0)\n", | |
| " df['rec_yards'] = df['receiving_yards'].clip(lower=0)\n", | |
| " df['yac_yards'] = df['yards_after_catch'].clip(lower=0)\n", | |
| "\n", | |
| " df['first_down'] = df['first_down_pass']\n", | |
| " df['fantasy_ppr'] = df['fantasy_points_ppr']\n", | |
| " df['team'] = df['posteam']\n", | |
| " metrics = ['target','first_down','air_yards','rec_yards','yac_yards','fantasy_ppr']\n", | |
| " for m in metrics:\n", | |
| " df[f'{m}_zone'] = df[m] * df['is_zone']\n", | |
| " df[f'{m}_man'] = df[m] * df['is_man']\n", | |
| " return df\n", | |
| "\n", | |
| "# 2) Over‐expected residuals\n", | |
| "def add_over_expected(df):\n", | |
| " df = df.copy()\n", | |
| " cols = ['yardline_100','down','ydstogo','qtr','wp','ep']\n", | |
| " mask = df[cols].notna().all(axis=1) & df['rec_yards'].notna()\n", | |
| " X, y = df.loc[mask, cols], df.loc[mask, 'rec_yards']\n", | |
| " model = XGBRegressor(max_depth=3, n_estimators=100, verbosity=0)\n", | |
| " cv = KFold(n_splits=10, shuffle=True, random_state=42)\n", | |
| " preds = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)\n", | |
| " df['oe_rec_yards'] = np.nan\n", | |
| " df.loc[mask, 'oe_rec_yards'] = y - preds\n", | |
| " df['oe_yac'] = df['yac_yards'] - df['xyac_mean_yardage']\n", | |
| " df[['oe_rec_yards','oe_yac']] = df[['oe_rec_yards','oe_yac']].fillna(0)\n", | |
| " return df\n", | |
| "\n", | |
| "\n", | |
| "def aggregate_to_week(df, year, position):\n", | |
| " df_feat = add_over_expected(engineer_play_level_features(df))\n", | |
| " count_cols = ['target','rec_yards','yac_yards','first_down','air_yards','fantasy_ppr','oe_rec_yards','oe_yac']\n", | |
| " splits = [f'{m}_{z}' for m in ['target','first_down','air_yards','rec_yards','yac_yards','fantasy_ppr'] for z in ['zone','man']]\n", | |
| " weekly_sum = df_feat.groupby(['receiver_player_id','season','team','week'], as_index=False)[count_cols + splits].sum()\n", | |
| "\n", | |
| " # 1) pull in snap counts + player biographics\n", | |
| " snaps = nfl.import_snap_counts(YEARS)[['season','week','pfr_player_id','offense_snaps']]\n", | |
| " id_map = (\n", | |
| " nfl.import_ids()[[\n", | |
| " 'gsis_id','pfr_id','sleeper_id',\n", | |
| " 'draft_pick','age','height','weight',\n", | |
| " 'name','position'\n", | |
| " ]]\n", | |
| " .rename(columns={'pfr_id':'pfr_player_id'})\n", | |
| " )\n", | |
| "\n", | |
| " id_map = id_map[id_map['position']==position]\n", | |
| "\n", | |
| "\n", | |
| " # 2) impute biographic missings\n", | |
| " id_map['draft_pick'] = id_map['draft_pick'].fillna(id_map['draft_pick'].max() + 1)\n", | |
| " id_map['age'] = id_map['age'].fillna(id_map['age'].median())\n", | |
| " id_map['height'] = id_map['height'].fillna(id_map['height'].median())\n", | |
| " id_map['weight'] = id_map['weight'].fillna(id_map['weight'].median())\n", | |
| "\n", | |
| " snap_map = snaps.merge(id_map, on='pfr_player_id', how='left')\n", | |
| "\n", | |
| " # 3) join with weekly sums\n", | |
| " weekly = weekly_sum.merge(\n", | |
| " snap_map[[\n", | |
| " 'gsis_id','sleeper_id','draft_pick','age','height','weight',\n", | |
| " 'name','position','season','week','offense_snaps'\n", | |
| " ]],\n", | |
| " left_on=['receiver_player_id','season','week'],\n", | |
| " right_on=['gsis_id','season','week'],\n", | |
| " how='left'\n", | |
| " )\n", | |
| " weekly['route_run'] = weekly['offense_snaps'].fillna(0)\n", | |
| " weekly.drop(columns=['gsis_id','offense_snaps'], inplace=True)\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| " # 4) per‐route rates & pct splits\n", | |
| " den = weekly['route_run'].replace(0, np.nan)\n", | |
| " for col in ['rec_yards','yac_yards','target','first_down','air_yards','fantasy_ppr','oe_rec_yards','oe_yac']:\n", | |
| " weekly[f'{col}_rprr'] = weekly[col] / den\n", | |
| " for base in ['target','first_down','air_yards','rec_yards','yac_yards','fantasy_ppr']:\n", | |
| " weekly[f'{base}_zone_pct'] = weekly[f'{base}_zone'] / weekly[base].replace(0, np.nan)\n", | |
| " weekly[f'{base}_man_pct'] = weekly[f'{base}_man'] / weekly[base].replace(0, np.nan)\n", | |
| "\n", | |
| " weekly = weekly[weekly['position']==position]\n", | |
| "\n", | |
| " return weekly\n", | |
| "\n", | |
| "\n", | |
| "# 4) Season aggregation\n", | |
| "def aggregate_to_season(weekly_df):\n", | |
| " sum_cols = [c for c in weekly_df.columns if c not in ['receiver_player_id','sleeper_id','name','position','season','team','week']]\n", | |
| " agg_dict = {col:(col,'sum') for col in sum_cols}\n", | |
| " agg_dict['games'] = ('week','nunique')\n", | |
| " season = weekly_df.groupby(['receiver_player_id','sleeper_id', 'name','position','team','season'], as_index=False).agg(**agg_dict)\n", | |
| "\n", | |
| " den = season['route_run'].replace(0, np.nan)\n", | |
| " for col in ['rec_yards','yac_yards','target','first_down','air_yards','fantasy_ppr','oe_rec_yards','oe_yac']:\n", | |
| " season[f'{col}_rprr'] = season[col] / den\n", | |
| " season['fantasy_ppr_per_game'] = season['fantasy_ppr'] / season['games']\n", | |
| " for b in ['target','first_down','air_yards','rec_yards','yac_yards','fantasy_ppr']:\n", | |
| " season[f'{b}_zone_pct'] = season[f'{b}_zone'] / season[b].replace(0,np.nan)\n", | |
| " season[f'{b}_man_pct'] = season[f'{b}_man'] / season[b].replace(0,np.nan)\n", | |
| " season.replace([np.inf,-np.inf],np.nan, inplace=True)\n", | |
| " # #### ARBITRARY CUT OFF TO RECREATE DWAIN MCFARLANDS METHODOLOGY\n", | |
| " # season = season.loc[season['route_run'] >= 250]\n", | |
| "\n", | |
| " return season\n", | |
| "\n", | |
| "# Usage\n", | |
| "weekly_wr = aggregate_to_week(pbp, YEARS, POSITION)\n", | |
| "season_stats = aggregate_to_season(weekly_wr)\n", | |
| "season_stats.head()\n" | |
| ], | |
| "metadata": { | |
| "id": "aKmudV0vn3FV" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
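| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "As referenced above, a minimal sketch of a candidate feature to test: a red-zone target flag (assuming `yardline_100` is yards to the opposing end zone, as in nflfastR play-by-play). To run it through the pipeline, compute it inside `engineer_play_level_features` and append `'rz_target'` to the `metrics` and `count_cols` lists so it aggregates like the other counting stats." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Sketch of a candidate feature: targets inside the opponent's 20.\n", | |
| "# 'yardline_100' is assumed to be distance to the opposing end zone.\n", | |
| "pbp['rz_target'] = (pbp['yardline_100'] <= 20).astype(int)\n", | |
| "pbp[['yardline_100', 'rz_target']].describe()" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |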
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Inspect Data" | |
| ], | |
| "metadata": { | |
| "id": "0ylyvCObSCfc" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "season_stats.describe(include='all').T" | |
| ], | |
| "metadata": { | |
| "id": "2GwxJaLep39M" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Leader Boards\n", | |
| "Because they are fun and to check to see if metrics pass the smell test- We filtered to a minimum snap count in a season before this, keep that in mind" | |
| ], | |
| "metadata": { | |
| "id": "VqVgQKwdSFJi" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# ─── Leaderboards with 350-route cutoff applied only here ────────────────────\n", | |
| "\n", | |
| "# 1) filter for players who ran at least 350 routes in a season\n", | |
| "leaderboard = season_stats.loc[season_stats['route_run'] >= 350]\n", | |
| "\n", | |
| "# 2) Top 20 by over-expected receiving yards per route\n", | |
| "print(\"Top 20 players by over-expected receiving yards per route:\")\n", | |
| "display(\n", | |
| " leaderboard\n", | |
| " .sort_values('oe_rec_yards_rprr', ascending=False)\n", | |
| " .head(20)[['season', 'name', 'team','position', 'oe_rec_yards_rprr']]\n", | |
| ")\n", | |
| "\n", | |
| "# 3) Top 20 by over-expected yards after catch per route\n", | |
| "print(\"\\nTop 20 players by over-expected yards after catch per route:\")\n", | |
| "display(\n", | |
| " leaderboard\n", | |
| " .sort_values('oe_yac_rprr', ascending=False)\n", | |
| " .head(20)[['season', 'name', 'team','position', 'oe_yac_rprr']]\n", | |
| ")\n", | |
| "\n", | |
| "# 4) Top 20 by air yards per route\n", | |
| "print(\"\\nTop 20 players by air yards per route:\")\n", | |
| "display(\n", | |
| " leaderboard\n", | |
| " .sort_values('air_yards_rprr', ascending=False)\n", | |
| " .head(20)[['season', 'name', 'team','position', 'air_yards_rprr']]\n", | |
| ")\n", | |
| "\n", | |
| "# 5) Top 20 by targets per route\n", | |
| "print(\"\\nTop 20 players by targets per route:\")\n", | |
| "display(\n", | |
| " leaderboard\n", | |
| " .sort_values('target_rprr', ascending=False)\n", | |
| " .head(20)[['season', 'name','team', 'position', 'target_rprr']]\n", | |
| ")\n" | |
| ], | |
| "metadata": { | |
| "id": "STPwxjXq3i8z" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Recreate Stability and Predicitve Correlation Analysis from Dwain's Tweet" | |
| ], | |
| "metadata": { | |
| "id": "RmOnyRI_S13h" | |
| } | |
| }, | |
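| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "The cell below reports three flavors of predictive $R^2$, summarizing the helper functions it defines. With $n$ player-seasons and a single predictor ($p = 1$), the adjusted value is\n", | |
| "\n", | |
| "$$R^2_{adj} = 1 - (1 - R^2)\\frac{n-1}{n-p-1}$$\n", | |
| "\n", | |
| "and the shrunken value passes Pearson's $r$ through Fisher's $z$-transform, shrinks it, and squares the back-transform:\n", | |
| "\n", | |
| "$$R^2_{shrunk} = \\tanh\\left(\\operatorname{arctanh}(r)\\,\\frac{n-3}{n-1}\\right)^2$$\n", | |
| "\n", | |
| "Both corrections pull small-sample correlations toward zero, which matters for thinner splits like the man/zone percentages." | |
| ], | |
| "metadata": {} | |
| }, | |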
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# 2) build 1-year lags\n", | |
| "metrics = [\n", | |
| " 'fantasy_ppr_per_game','target_rprr','first_down_rprr','target_zone_pct',\n", | |
| " 'target_man_pct','first_down_zone_pct','first_down_man_pct','rec_yards_rprr',\n", | |
| " 'oe_rec_yards_rprr','air_yards_rprr','air_yards_zone_pct','air_yards_man_pct',\n", | |
| " 'yac_yards_rprr','oe_yac_rprr','yac_yards_zone_pct','yac_yards_man_pct'\n", | |
| "]\n", | |
| "\n", | |
| "# 0) your display‐name mapping\n", | |
| "display_map = {\n", | |
| " 'fantasy_ppr_per_game': 'PPR',\n", | |
| " 'target_rprr': 'Target %',\n", | |
| " 'first_down_rprr': '1D/RR',\n", | |
| " 'target_zone_pct': 'Zone Target %',\n", | |
| " 'target_man_pct': 'Man Target %',\n", | |
| " 'first_down_zone_pct': 'Zone 1D %',\n", | |
| " 'first_down_man_pct': 'Man 1D %',\n", | |
| " 'rec_yards_rprr': 'YPRR',\n", | |
| " 'oe_rec_yards_rprr': 'YPRR OE',\n", | |
| " 'air_yards_rprr': 'Air Yards/RR',\n", | |
| " 'air_yards_zone_pct': 'Zone Air Yards %',\n", | |
| " 'air_yards_man_pct': 'Man Air Yards %',\n", | |
| " 'yac_yards_rprr': 'YAC/RR',\n", | |
| " 'oe_yac_rprr': 'YAC OE/RR',\n", | |
| " 'yac_yards_zone_pct': 'Zone YAC %',\n", | |
| " 'yac_yards_man_pct': 'Man YAC %'\n", | |
| "}\n", | |
| "\n", | |
| "# helper functions\n", | |
| "def adjusted_r2(r2, n, p=1):\n", | |
| " \"\"\"Adjusted R² for one predictor.\"\"\"\n", | |
| " return 1 - (1 - r2) * (n - 1) / (n - p - 1)\n", | |
| "\n", | |
| "def fisher_shrunken_r2(r, n):\n", | |
| " \"\"\"Fisher’s z–shrinkage of Pearson r, squared.\"\"\"\n", | |
| " z = np.arctanh(r)\n", | |
| " z_shrunk = z * (n - 3) / (n - 1)\n", | |
| " return np.tanh(z_shrunk) ** 2\n", | |
| "\n", | |
| "# 1) ensure your lag features and ppr_next exist\n", | |
| "ws = season_stats[season_stats['position']=='WR'].copy()\n", | |
| "ws = ws.sort_values(['receiver_player_id','season'])\n", | |
| "ws['ppr_next'] = ws.groupby('receiver_player_id')['fantasy_ppr_per_game'].shift(-1)\n", | |
| "for m in metrics:\n", | |
| " ws[m + '_lag'] = ws.groupby('receiver_player_id')[m].shift(1)\n", | |
| "\n", | |
| "# 2) compute stability & multiple predictive R² estimates\n", | |
| "results = []\n", | |
| "for m in metrics:\n", | |
| " stab_df = ws.dropna(subset=[m, m + '_lag'])\n", | |
| " r_stab = stab_df[m].corr(stab_df[m + '_lag'])\n", | |
| " stab_r2 = r_stab**2 if pd.notna(r_stab) else np.nan\n", | |
| "\n", | |
| " pred_df = ws.dropna(subset=[m, 'ppr_next'])\n", | |
| " n = len(pred_df)\n", | |
| " if n >= 4:\n", | |
| " r_pred = pred_df[m].corr(pred_df['ppr_next'])\n", | |
| " raw_r2 = r_pred**2\n", | |
| " adj_r2 = adjusted_r2(raw_r2, n)\n", | |
| " shrunk_r2 = fisher_shrunken_r2(r_pred, n)\n", | |
| " else:\n", | |
| " raw_r2 = adj_r2 = shrunk_r2 = np.nan\n", | |
| "\n", | |
| " results.append({\n", | |
| " 'metric': m,\n", | |
| " 'Stability (R²)': stab_r2,\n", | |
| " 'Raw Predictive (R²)': raw_r2,\n", | |
| " 'Adjusted Predictive (R²)': adj_r2,\n", | |
| " 'Shrunken Predictive (R²)': shrunk_r2\n", | |
| " })\n", | |
| "\n", | |
| "# 3) assemble & rename\n", | |
| "df_final = (\n", | |
| " pd.DataFrame(results)\n", | |
| " .set_index('metric')\n", | |
| " .rename(index=display_map)\n", | |
| ")\n", | |
| "\n", | |
| "# 4) sort by the most conservative (shrunken) R²\n", | |
| "display(df_final.sort_values('Shrunken Predictive (R²)', ascending=False))\n" | |
| ], | |
| "metadata": { | |
| "id": "DY3Id3Vh6TK0" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Prep for Feature Selection" | |
| ], | |
| "metadata": { | |
| "id": "mbsXlSb1ZB_e" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Boruta Shap\n", | |
| "Results are slightly different than tweet because I tweaked the snap count threshold" | |
| ], | |
| "metadata": { | |
| "id": "orbrZ5icZVSb" | |
| } | |
| }, | |
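| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Before the full pipeline, a toy, self-contained illustration of the idea behind Boruta (which BorutaShap drives with SHAP importances): each real feature competes against a shuffled \"shadow\" copy of itself, and only features whose importance beats the best shadow are accepted. Everything below is synthetic demo data, not the WR dataset." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Toy demonstration of shadow features (synthetic data, illustrative only).\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "from xgboost import XGBRegressor\n", | |
| "\n", | |
| "rng = np.random.default_rng(42)\n", | |
| "X_demo = pd.DataFrame({'signal': rng.normal(size=500), 'noise': rng.normal(size=500)})\n", | |
| "y_demo = 2 * X_demo['signal'] + rng.normal(size=500)\n", | |
| "\n", | |
| "# Shadows keep each feature's marginal distribution but break any link to y.\n", | |
| "shadows = X_demo.apply(lambda col: rng.permutation(col.values))\n", | |
| "shadows.columns = ['shadow_' + c for c in X_demo.columns]\n", | |
| "\n", | |
| "m = XGBRegressor(max_depth=3, n_estimators=100, verbosity=0)\n", | |
| "m.fit(pd.concat([X_demo, shadows], axis=1), y_demo)\n", | |
| "for name, imp in zip(list(X_demo.columns) + list(shadows.columns), m.feature_importances_):\n", | |
| "    print(f\"{name:15} {imp:.3f}\")  # 'signal' should dwarf every shadow\n" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |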
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "import shap\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "\n", | |
| "from xgboost import XGBRegressor\n", | |
| "from BorutaShap import BorutaShap\n", | |
| "\n", | |
| "from sklearn.impute import KNNImputer\n", | |
| "from sklearn.preprocessing import OneHotEncoder, MinMaxScaler\n", | |
| "from sklearn.metrics import r2_score, root_mean_squared_error\n", | |
| "\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "# 0) Precondition: `ws` DataFrame contains:\n", | |
| "# – one-year lags for each metric, ending in \"_lag\"\n", | |
| "# – bio cols: 'draft_pick', 'age', 'height', 'weight'\n", | |
| "# – 'team' (string)\n", | |
| "# – target 'fantasy_ppr_per_game'\n", | |
| "# – 'route_run' (for sample weights)\n", | |
| "# – 'season' (int year)\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "\n", | |
| "# 1) Drop any rows missing the target\n", | |
| "df = ws.dropna(subset=['fantasy_ppr_per_game']).copy()\n", | |
| "\n", | |
| "# 2) Identify columns\n", | |
| "lag_cols = [c for c in df.columns if c.endswith('_lag')]\n", | |
| "bio_cols = ['draft_pick','age','height','weight']\n", | |
| "cat_cols = ['team']\n", | |
| "all_impute = lag_cols + bio_cols + cat_cols\n", | |
| "select_lags = lag_cols + ['age'] # for SHAP & Boruta\n", | |
| "target_col = 'fantasy_ppr_per_game'\n", | |
| "\n", | |
| "# 3) Chronological train/test split\n", | |
| "train_mask = df['season'] < 2024\n", | |
| "X_imp_train = df.loc[train_mask, all_impute]\n", | |
| "X_imp_test = df.loc[~train_mask, all_impute]\n", | |
| "y_train = df.loc[train_mask, target_col]\n", | |
| "y_test = df.loc[~train_mask, target_col]\n", | |
| "w_train = df.loc[train_mask, 'route_run']\n", | |
| "\n", | |
| "# 4) One‐hot encode `team` for imputation\n", | |
| "ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')\n", | |
| "team_train_ohe = pd.DataFrame(\n", | |
| " ohe.fit_transform(X_imp_train[['team']]),\n", | |
| " columns=ohe.get_feature_names_out(['team']),\n", | |
| " index=X_imp_train.index\n", | |
| ")\n", | |
| "team_test_ohe = pd.DataFrame(\n", | |
| " ohe.transform(X_imp_test[['team']]),\n", | |
| " columns=ohe.get_feature_names_out(['team']),\n", | |
| " index=X_imp_test.index\n", | |
| ")\n", | |
| "\n", | |
| "# Assemble numeric + dummies for KNN\n", | |
| "X_num_train = pd.concat([X_imp_train[lag_cols + bio_cols], team_train_ohe], axis=1)\n", | |
| "X_num_test = pd.concat([X_imp_test[lag_cols + bio_cols], team_test_ohe ], axis=1)\n", | |
| "\n", | |
| "# 5) Impute missing with KNN\n", | |
| "imputer = KNNImputer(n_neighbors=5, weights='distance')\n", | |
| "X_train_imp = pd.DataFrame(imputer.fit_transform(X_num_train),\n", | |
| " columns=X_num_train.columns, index=X_num_train.index)\n", | |
| "X_test_imp = pd.DataFrame(imputer.transform(X_num_test),\n", | |
| " columns=X_num_test.columns, index=X_num_test.index)\n", | |
| "\n", | |
| "# 6) Drop dummy cols post‐imputation\n", | |
| "drop_teams = ohe.get_feature_names_out(['team'])\n", | |
| "X_train_num = X_train_imp.drop(columns=drop_teams)\n", | |
| "X_test_num = X_test_imp.drop(columns=drop_teams)\n", | |
| "\n", | |
| "# 7) Scale to [0,1]\n", | |
| "scaler = MinMaxScaler()\n", | |
| "X_train = pd.DataFrame(scaler.fit_transform(X_train_num),\n", | |
| " columns=X_train_num.columns, index=X_train_num.index)\n", | |
| "X_test = pd.DataFrame(scaler.transform(X_test_num),\n", | |
| " columns=X_test_num.columns, index=X_test_num.index)\n", | |
| "\n", | |
| "# 8) Define SHAP/Boruta feature set\n", | |
| "select_cols = select_lags\n", | |
| "\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "# 9) Fit full XGB → compute Global SHAP\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "model_full = XGBRegressor(max_depth=3, n_estimators=200,\n", | |
| " random_state=42, verbosity=0)\n", | |
| "model_full.fit(X_train[select_cols], y_train, sample_weight=w_train)\n", | |
| "\n", | |
| "explainer = shap.TreeExplainer(model_full)\n", | |
| "shap_vals = explainer.shap_values(X_train[select_cols])\n", | |
| "global_shap = pd.Series(np.abs(shap_vals).mean(axis=0),\n", | |
| " index=select_cols, name='Global SHAP')\n", | |
| "\n", | |
| "# Beeswarm plot\n", | |
| "shap.summary_plot(shap_vals, X_train[select_cols], plot_type='dot')\n", | |
| "plt.show()\n", | |
| "\n", | |
| "# Top 5 SHAP interaction pairs\n", | |
| "inter = explainer.shap_interaction_values(X_train[select_cols])\n", | |
| "mean_int = np.abs(inter).mean(axis=0)\n", | |
| "pairs = [\n", | |
| " (select_cols[i], select_cols[j], mean_int[i,j])\n", | |
| " for i in range(len(select_cols)) for j in range(i+1, len(select_cols))\n", | |
| "]\n", | |
| "print(\"Top 5 SHAP interactions:\")\n", | |
| "for f1,f2,val in sorted(pairs, key=lambda x: x[2], reverse=True)[:15]:\n", | |
| " print(f\" {f1:30} {f2:30} {val:.4f}\")\n", | |
| "\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "# 10) Boruta-Shap on TRAIN only (weighted by route_run)\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "opt = BorutaShap(\n", | |
| " model=model_full,\n", | |
| " importance_measure='shap',\n", | |
| " classification=False\n", | |
| ")\n", | |
| "opt.fit(\n", | |
| " X=X_train[select_cols],\n", | |
| " y=y_train,\n", | |
| " sample_weight=w_train,\n", | |
| " n_trials=100,\n", | |
| " train_or_test='test',\n", | |
| " sample=False,\n", | |
| " normalize=True,\n", | |
| " verbose=True\n", | |
| ")\n", | |
| "\n", | |
| "# Boxplot of Z‐scores\n", | |
| "opt.plot(which_features='all'); plt.show()\n", | |
| "\n", | |
| "accepted = set(opt.accepted)\n", | |
| "print(\"Boruta-Shap accepted:\", accepted)\n", | |
| "\n", | |
| "display_map = {\n", | |
| " 'fantasy_ppr_per_game': 'PPR',\n", | |
| " 'target_rprr': 'Target %',\n", | |
| " 'first_down_rprr': '1D/RR',\n", | |
| " 'target_zone_pct': 'Zone Target %',\n", | |
| " 'target_man_pct': 'Man Target %',\n", | |
| " 'first_down_zone_pct': 'Zone 1D %',\n", | |
| " 'first_down_man_pct': 'Man 1D %',\n", | |
| " 'rec_yards_rprr': 'YPRR',\n", | |
| " 'oe_rec_yards_rprr': 'YPRR OE',\n", | |
| " 'air_yards_rprr': 'Air Yards/RR',\n", | |
| " 'air_yards_zone_pct': 'Zone Air Yards %',\n", | |
| " 'air_yards_man_pct': 'Man Air Yards %',\n", | |
| " 'yac_yards_rprr': 'YAC/RR',\n", | |
| " 'oe_yac_rprr': 'YAC OE/RR',\n", | |
| " 'yac_yards_zone_pct': 'Zone YAC %',\n", | |
| " 'yac_yards_man_pct': 'Man YAC %'\n", | |
| "}\n", | |
| "\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "# 11) Build summary table and reindex before join\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "hist = opt.history_x.copy()\n", | |
| "hist.drop(columns=['Max_Shadow','Min_Shadow','Mean_Shadow','Median_Shadow'],\n", | |
| " errors='ignore', inplace=True)\n", | |
| "\n", | |
| "# average Z across trials (drop the initial zero-row)\n", | |
| "mean_z = hist.iloc[1:].mean(axis=0)\n", | |
| "\n", | |
| "# strip “_lag” suffix & map to display names\n", | |
| "metrics_stripped = [col.replace('_lag','') for col in mean_z.index]\n", | |
| "display_index = [display_map.get(m, m) for m in metrics_stripped]\n", | |
| "\n", | |
| "meta = pd.DataFrame({\n", | |
| " 'Global SHAP': global_shap.values,\n", | |
| " 'Mean Z': mean_z.values,\n", | |
| " 'Boruta': [\n", | |
| " 'Accepted' if m in accepted\n", | |
| " else 'Rejected' if m in opt.rejected\n", | |
| " else 'Tentative'\n", | |
| " for m in metrics_stripped\n", | |
| " ]\n", | |
| "}, index=display_index)\n", | |
| "\n", | |
| "# reindex to match df_final’s index order, then join\n", | |
| "meta = meta.reindex(df_final.index)\n", | |
| "full_table = df_final.join(meta, how='left')\n", | |
| "display(full_table)\n", | |
| "\n", | |
| "\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "# 12) Final retrain on accepted lag‐features + age → evaluate on 2024\n", | |
| "# ─────────────────────────────────────────────────────────────────────────────\n", | |
| "# only include valid lag columns\n", | |
| "final_feats = [\n", | |
| " f\"{f}_lag\" for f in mean_z.index\n", | |
| " if f in accepted and f+\"_lag\" in X_train.columns\n", | |
| "] + ['age']\n", | |
| "\n", | |
| "model_final = XGBRegressor(max_depth=3, n_estimators=200,\n", | |
| " random_state=42, verbosity=0)\n", | |
| "model_final.fit(X_train[final_feats], y_train, sample_weight=w_train)\n", | |
| "\n", | |
| "y_pred = model_final.predict(X_test[final_feats])\n", | |
| "print(\"Hold-out R² =\", r2_score(y_test, y_pred))\n", | |
| "print(\"Hold-out RMSE =\", root_mean_squared_error(y_test, y_pred))\n", | |
| "\n", | |
| "\n", | |
| "\n" | |
| ], | |
| "metadata": { | |
| "id": "CoQL7xF_C-V2" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "display(full_table.sort_values('Mean Z', ascending=False))" | |
| ], | |
| "metadata": { | |
| "id": "LQmNQKWuX50s" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Top 5 SHAP interaction pairs\n", | |
| "inter = explainer.shap_interaction_values(X_train[select_cols])\n", | |
| "mean_int = np.abs(inter).mean(axis=0)\n", | |
| "pairs = [\n", | |
| " (select_cols[i], select_cols[j], mean_int[i,j])\n", | |
| " for i in range(len(select_cols)) for j in range(i+1, len(select_cols))\n", | |
| "]\n", | |
| "print(\"Top 15 SHAP interactions:\")\n", | |
| "for f1,f2,val in sorted(pairs, key=lambda x: x[2], reverse=True)[:15]:\n", | |
| " print(f\" {f1:30} {f2:30} {val:.4f}\")" | |
| ], | |
| "metadata": { | |
| "id": "A2eLoYJe7UHs" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "from BorutaShap import BorutaShap\n", | |
| "from xgboost import XGBRegressor\n", | |
| "from sklearn.impute import KNNImputer\n", | |
| "from sklearn.preprocessing import MinMaxScaler\n", | |
| "from sklearn.metrics import r2_score, mean_squared_error\n", | |
| "import shap\n", | |
| "\n", | |
| "# 0) ws must already include your one-year lags *_lag, your bio columns,\n", | |
| "# and ppr_next = next season’s fantasy_ppr_per_game.\n", | |
| "# e.g. ws[['fantasy_ppr_per_game_lag', …, 'draft_pick','age','height','weight','ppr_next','season']]\n", | |
| "\n", | |
| "# 1) Prepare X/y and drop rows without a next-year target\n", | |
| "lag_cols = [c for c in ws.columns if c.endswith('_lag')]\n", | |
| "bio_cols = ['draft_pick','age','height','weight']\n", | |
| "df = ws.dropna(subset=['fantasy_ppr_per_game']).copy()\n", | |
| "X = df[lag_cols + bio_cols]\n", | |
| "y = df['fantasy_ppr_per_game']\n", | |
| "\n", | |
| "# 2) Impute all missing features with KNN\n", | |
| "imputer = KNNImputer(n_neighbors=5, weights='distance')\n", | |
| "X_imputed = pd.DataFrame(imputer.fit_transform(X),\n", | |
| " columns=X.columns, index=X.index)\n", | |
| "\n", | |
| "# 3) Scale everything to [0,1] for consistency\n", | |
| "scaler = MinMaxScaler()\n", | |
| "X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed),\n", | |
| " columns=X.columns, index=X.index)\n", | |
| "\n", | |
| "# 4) Chronological train/test split\n", | |
| "is_train = df['season'] < 2024\n", | |
| "X_train, y_train = X_scaled[is_train], y[is_train]\n", | |
| "X_test, y_test = X_scaled[~is_train], y[~is_train]\n", | |
| "\n", | |
| "# 5) Boruta-Shap on the TRAIN set only\n", | |
| "opt = BorutaShap(\n", | |
| " model=XGBRegressor(max_depth=3, n_estimators=200, random_state=42, verbosity=0),\n", | |
| " importance_measure='shap',\n", | |
| " classification=False\n", | |
| ")\n", | |
| "opt.fit(\n", | |
| " X=X_train, y=y_train,\n", | |
| " n_trials=100,\n", | |
| " train_or_test='test',\n", | |
| " sample=False,\n", | |
| " normalize=True,\n", | |
| " verbose=True\n", | |
| ")\n", | |
| "selected = opt.accepted\n", | |
| "print(\"Selected features:\", selected)\n", | |
| "\n", | |
| "# 6) Fit a final XGB on just those features and evaluate on 2024\n", | |
| "model_final = XGBRegressor(max_depth=3, n_estimators=200, random_state=42, verbosity=0)\n", | |
| "model_final.fit(X_train[selected], y_train)\n", | |
| "y_pred = model_final.predict(X_test[selected])\n", | |
| "\n", | |
| "print(\"Test R² =\", r2_score(y_test, y_pred))" | |
| ], | |
| "metadata": { | |
| "id": "h_wyLn_iDZP9" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import re\n", | |
| "import requests\n", | |
| "from sklearn.preprocessing import MinMaxScaler\n", | |
| "\n", | |
| "# 1) Isolate 2024 WRs with the three lagged Boruta–SHAP features\n", | |
| "s24 = (\n", | |
| " ws\n", | |
| " .query(\"season == 2024 and position == 'WR'\")\n", | |
| " .dropna(subset=[\n", | |
| " 'sleeper_id',\n", | |
| " 'fantasy_ppr_per_game',\n", | |
| " 'yac_yards_rprr',\n", | |
| " 'oe_rec_yards_rprr',\n", | |
| " 'rec_yards_rprr'\n", | |
| " ])\n", | |
| " [['name','sleeper_id',\n", | |
| " 'fantasy_ppr_per_game',\n", | |
| " 'yac_yards_rprr',\n", | |
| " 'oe_rec_yards_rprr',\n", | |
| " 'rec_yards_rprr']]\n", | |
| " .reset_index(drop=True)\n", | |
| ")\n", | |
| "s24['sleeper_id'] = s24['sleeper_id'].astype(float)\n", | |
| "\n", | |
| "# 2) Normalize each lag metric to [0,1] and build a composite score\n", | |
| "scaler = MinMaxScaler()\n", | |
| "s24[['ppr_n','yac_n','oe_n', 'yprr_n']] = scaler.fit_transform(\n", | |
| " s24[[\n", | |
| " 'fantasy_ppr_per_game',\n", | |
| " 'yac_yards_rprr',\n", | |
| " 'oe_rec_yards_rprr',\n", | |
| " 'rec_yards_rprr'\n", | |
| " ]]\n", | |
| ")\n", | |
| "s24['composite'] = s24[['ppr_n','yac_n','oe_n','yprr_n']].mean(axis=1)\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "# 3) Fetch 2025 PPR ADP (include the player’s name)\n", | |
| "# https://www.fantasydatapros.com/fantasyfootball/blog/fantasycalc/1\n", | |
| "def fetch_adp(year):\n", | |
| " url = f\"https://fantasyfootballcalculator.com/api/v1/adp/ppr?teams=12&year={year}\"\n", | |
| " data = requests.get(url).json()\n", | |
| " recs = data.get(\"players\") if isinstance(data, dict) else data\n", | |
| " return pd.DataFrame([\n", | |
| " {\n", | |
| " \"player_id\": r[\"player_id\"],\n", | |
| " \"name\": r.get(\"name\") or r.get(\"player_name\"),\n", | |
| " \"adp\": r[\"adp\"]\n", | |
| " }\n", | |
| " for r in (recs or [])\n", | |
| " ])\n", | |
| "\n", | |
| "adp25 = fetch_adp(2025)\n", | |
| "\n", | |
| "# 4) build a “merge key” by cleaning & truncating names\n", | |
| "def make_key(n):\n", | |
| " s = n.lower()\n", | |
| " # remove Jr/Sr/II/III/etc and punctuation\n", | |
| " s = re.sub(r\"\\b(jr|sr|ii|iii|iv)\\b\", \"\", s)\n", | |
| " s = re.sub(r\"[^\\w\\s]\", \"\", s).strip()\n", | |
| " parts = s.split()\n", | |
| " if len(parts) >= 2:\n", | |
| " first, last = parts[0], parts[-1]\n", | |
| " else:\n", | |
| " first, last = parts[0], \"\"\n", | |
| " return first[:3] + last\n", | |
| "\n", | |
| "# add our merge‐keys\n", | |
| "s24[\"key\"] = s24[\"name\"].map(make_key)\n", | |
| "adp25[\"key\"] = adp25[\"name\"].map(make_key)\n", | |
| "\n", | |
| "# 5) merge on that key and drop any non-matches\n", | |
| "df = (\n", | |
| " s24\n", | |
| " .merge(adp25[[\"key\",\"adp\"]], on=\"key\", how=\"left\")\n", | |
| " .dropna(subset=[\"adp\"])\n", | |
| ")\n", | |
| "\n", | |
| "# 6) compute value_score = normalized composite – normalized ADP\n", | |
| "df[\"adp_norm\"] = MinMaxScaler().fit_transform(df[[\"adp\"]]).ravel()\n", | |
| "df[\"value_score\"] = df[\"composite\"] - df[\"adp_norm\"]\n", | |
| "\n", | |
| "# 7) pick top 5 undervalued WRs by value_score\n", | |
| "targets = (\n", | |
| " df\n", | |
| " .sort_values(\"value_score\", ascending=False)\n", | |
| " .head(5)\n", | |
| " .loc[:, [\n", | |
| " \"name\",\n", | |
| " \"fantasy_ppr_per_game\",\n", | |
| " \"yac_yards_rprr\",\n", | |
| " \"oe_rec_yards_rprr\",\n", | |
| " \"rec_yards_rprr\",\n", | |
| " \"adp\",\n", | |
| " \"value_score\"\n", | |
| " ]]\n", | |
| " .rename(columns={\n", | |
| " \"fantasy_ppr_per_game\": \"PPR\",\n", | |
| " \"yac_yards_rprr\": \"YAC/RR\",\n", | |
| " \"oe_rec_yards_rprr\": \"YPRR-OE\",\n", | |
| " \"rec_yards_rprr\" : \"YPRR\"\n", | |
| " })\n", | |
| " .reset_index(drop=True)\n", | |
| ")\n", | |
| "\n", | |
| "print(\"Top 5 undervalued WRs for 2025:\\n\", targets)" | |
| ], | |
| "metadata": { | |
| "id": "NihIdwYIX7O4" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
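| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "A quick check of the merge-key scheme above (hypothetical names, just to show the behavior): suffixes and punctuation are stripped, then the key is the first three letters of the first name plus the last name." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Sanity-check make_key on a few suffix/punctuation cases.\n", | |
| "for n in ['Marvin Harrison Jr.', 'A.J. Brown', 'Odell Beckham III']:\n", | |
| "    print(n, '->', make_key(n))" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |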
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# (5) compute each player’s 12-team draft round\n", | |
| "df['round'] = ((df['adp'] - 1) // 12 + 1).astype(int)\n", | |
| "\n", | |
| "# (6) bucket into Early/Mid/Late\n", | |
| "def segment(r):\n", | |
| " if r <= 3: return 'Early'\n", | |
| " if r <= 6: return 'Mid'\n", | |
| " return 'Late'\n", | |
| "\n", | |
| "df['segment'] = df['round'].apply(segment)\n", | |
| "\n", | |
| "# (7) select top-5 by value_score _within_ each bucket\n", | |
| "out = (\n", | |
| " df\n", | |
| " .sort_values('value_score', ascending=False)\n", | |
| " .groupby('segment', group_keys=False)\n", | |
| " .head(5)\n", | |
| " [['segment','round','name','fantasy_ppr_per_game',\n", | |
| " 'yac_yards_rprr','oe_rec_yards_rprr','rec_yards_rprr',\n", | |
| " 'adp','value_score']]\n", | |
| " .rename(columns={\n", | |
| " 'fantasy_ppr_per_game':'PPR',\n", | |
| " 'yac_yards_rprr': 'YAC/RR',\n", | |
| " 'oe_rec_yards_rprr': 'YPRR-OE',\n", | |
| " \"rec_yards_rprr\": \"YPRR\" })\n", | |
| " .reset_index(drop=True)\n", | |
| ").sort_values('value_score', ascending=False)\n", | |
| "\n", | |
| "display(\"Top 5 undervalued WRs by bucket:\\n\", out)" | |
| ], | |
| "metadata": { | |
| "id": "SuQA-vFysroV" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Next Steps\n", | |
| "- Add more/better data for man/zone, route participation, and age\n", | |
| "- Run analysis for other positions (RB, QB, TE -- KICKER!?, DEFENSE!?)\n", | |
| "- Figure out a better way to filter out low participation players, or decide to keep them and weight appropriately?" | |
| ], | |
| "metadata": { | |
| "id": "LL-74uVZZdDz" | |
| } | |
| } | |
| ] | |
| } |