Skip to content

Instantly share code, notes, and snippets.

@ssghost
Created July 16, 2025 04:36
Show Gist options
  • Select an option

  • Save ssghost/0df83755816d9afacbc26ed2c8a909c0 to your computer and use it in GitHub Desktop.

Select an option

Save ssghost/0df83755816d9afacbc26ed2c8a909c0 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "t0kGpJIhciXH",
"outputId": "e390920e-8238-4994-9a25-aea6bf8f9784"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[-1.44444444 42.77777778 -1. 0.55555556 -0.77777778 9.44444444\n",
" -0.33333333 0.33333333 -1. 20.55555556]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Sample data with outliers\n",
"data = np.array([1, 200, 3, 10, 4, 50, 6, 9, 3, 100])\n",
"\n",
"# Robust scaling: centre on the median, then divide by the median absolute\n",
"# deviation (MAD) of the centred values\n",
"centered = data - np.median(data)\n",
"scaled = centered / np.median(np.abs(centered))\n",
"print(scaled)\n"
]
},
{
"cell_type": "code",
"source": [
"# Sample continuous data (e.g., customer ages)\n",
"ages = np.array([18, 25, 35, 22, 45, 67, 23, 29, 34, 56, 41, 38, 52, 28, 33])\n",
"\n",
"# One-liner: Create 4 equal-frequency bins labelled 0-3.\n",
"# np.digitize already returns 0 for values below the first edge, so no offset\n",
"# is subtracted (subtracting 1 would label the lowest quartile -1).\n",
"binned = np.digitize(ages, np.percentile(ages, [25, 50, 75]))\n",
"print(binned)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MZ654h7ec34I",
"outputId": "144564d7-b48c-4bf2-c034-b131fbaeeefa"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[0 0 2 0 3 3 0 1 2 3 2 2 3 1 1]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Original features (e.g., temperature, humidity)\n",
"X = np.array([[20, 65], [25, 70], [30, 45], [22, 80]])\n",
"\n",
"# Degree-2 terms: every product x_i * x_j with i <= j (squares and cross terms)\n",
"n_cols = X.shape[1]\n",
"poly_features = np.stack([X[:, i] * X[:, j] for i in range(n_cols) for j in range(i, n_cols)], axis=1)\n",
"print(poly_features)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kzwwApnQc9pV",
"outputId": "11249482-96eb-47cc-db05-73bf1b620d27"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[[ 400 1300 4225]\n",
" [ 625 1750 4900]\n",
" [ 900 1350 2025]\n",
" [ 484 1760 6400]]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Time series data (e.g., daily sales)\n",
"sales = np.array([100, 98, 120,130, 74, 145, 110, 140, 65, 105, 135])\n",
"\n",
"# Lag-1, lag-2 and lag-3 columns; rows begin at index 3 so every lag has a real value\n",
"n_lags = 3\n",
"lags = np.stack([sales[n_lags - k : len(sales) - k] for k in range(1, n_lags + 1)], axis=1)\n",
"print(lags)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5XTtCYpQdfML",
"outputId": "9bbe8c29-c1e2-4cbb-be7d-00b77bfe89b7"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[[120 98 100]\n",
" [130 120 98]\n",
" [ 74 130 120]\n",
" [145 74 130]\n",
" [110 145 74]\n",
" [140 110 145]\n",
" [ 65 140 110]\n",
" [105 65 140]]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Categorical data (e.g., product categories)\n",
"categories = np.array([0, 1, 2, 1, 0, 2, 3, 1])\n",
"\n",
"# One-liner: One-hot encode by picking, for each label, the matching identity-matrix row\n",
"one_hot = np.eye(categories.max() + 1, dtype=int)[categories]\n",
"print(one_hot)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "R0HPVkzmdgvW",
"outputId": "a7a1e5b3-9dac-46c3-90bb-14915619a3dd"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[[1 0 0 0]\n",
" [0 1 0 0]\n",
" [0 0 1 0]\n",
" [0 1 0 0]\n",
" [1 0 0 0]\n",
" [0 0 1 0]\n",
" [0 0 0 1]\n",
" [0 1 0 0]]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Coordinate data\n",
"locations = np.array([[40.7128, -74.0060],\n",
" [34.0522, -118.2437],\n",
" [41.8781, -87.6298],\n",
" [29.7604, -95.3698]])\n",
"reference = np.array([39.7392, -104.9903])\n",
"\n",
"# One-liner: Euclidean (straight-line) distance of each row from the reference point\n",
"# NOTE(review): if these pairs are latitude/longitude, the result is in degrees, not a ground distance\n",
"distances = np.linalg.norm(locations - reference, axis=1)\n",
"print(distances)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "w0rszMzidyjs",
"outputId": "8eb2ff56-f6a1-4b4a-b383-b59770133c49"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[30.99959263 14.42201722 17.4917653 13.86111358]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Sample features (e.g., price, quality, brand_score)\n",
"features = np.array([[10, 8, 7], [15, 9, 6], [12, 7, 8], [20, 10, 9]])\n",
"\n",
"# All pairwise interactions: column i times column j for every pair with i < j\n",
"left, right = np.triu_indices(features.shape[1], k=1)\n",
"interactions = features[:, left] * features[:, right]\n",
"print(interactions)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0o8FYjTZd7ey",
"outputId": "cd8ce3c1-309b-4c75-d0b6-ac7f602d02fb"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[[ 80 70 56]\n",
" [135 90 54]\n",
" [ 84 96 56]\n",
" [200 180 90]]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Noisy signal data (e.g., stock prices, sensor readings)\n",
"signal = np.array([10, 27, 12, 18, 11, 19, 20, 26, 12, 19, 25, 31, 28])\n",
"window_size = 4\n",
"\n",
"# Rolling mean over complete windows only: each window sum is the difference of\n",
"# two running totals window_size apart\n",
"running_total = np.concatenate(([0], np.cumsum(signal)))\n",
"rolling_mean = (running_total[window_size:] - running_total[:-window_size]) / window_size\n",
"print(rolling_mean)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "amZgqlEZeAOI",
"outputId": "fa48d13f-f98b-43b9-8dd0-a1a66c9c7557"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[16.75 17. 15. 17. 19. 19.25 19.25 20.5 21.75 25.75]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Data with potential outliers (e.g., transaction amounts)\n",
"amounts = np.array([25, 30, 28, 32, 500, 29, 31, 27, 33, 26])\n",
"\n",
"# Outlier indicator: 1 for values outside the 5th-95th percentile band, else 0\n",
"lower, upper = np.percentile(amounts, [5, 95])\n",
"outlier_flags = (~((amounts >= lower) & (amounts <= upper))).astype(int)\n",
"print(outlier_flags)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "OiHHeWgreq21",
"outputId": "80db2904-677c-4eee-f6b1-2dedadabf2b5"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[1 0 0 0 1 0 0 0 0 0]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Categorical data (e.g., product categories)\n",
"categories = np.array(['Electronics', 'Books', 'Electronics', 'Clothing',\n",
" 'Books', 'Electronics', 'Home', 'Books'])\n",
"\n",
"# Frequency encode: replace each label with the number of times it occurs.\n",
"# return_inverse gives, for every element, its position in unique_cats, so a single\n",
"# vectorised lookup into counts replaces a per-element Python search.\n",
"unique_cats, inverse, counts = np.unique(categories, return_inverse=True, return_counts=True)\n",
"freq_encoded = counts[inverse]\n",
"print(freq_encoded)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Nor6f_t-ezhk",
"outputId": "1d142d36-f381-4f1a-eff8-162b6b2c5bd1"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[3 3 3 1 3 3 1 3]\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment