Created
July 16, 2025 04:36
-
-
Save ssghost/0df83755816d9afacbc26ed2c8a909c0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [] | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "t0kGpJIhciXH", | |
| "outputId": "e390920e-8238-4994-9a25-aea6bf8f9784" | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[-1.44444444 42.77777778 -1. 0.55555556 -0.77777778 9.44444444\n", | |
| " -0.33333333 0.33333333 -1. 20.55555556]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import numpy as np\n", | |
| "\n", | |
| "# Sample data with outliers\n", | |
| "data = np.array([1, 200, 3, 10, 4, 50, 6, 9, 3, 100])\n", | |
| "\n", | |
| "# One-liner: Robust scaling using the median and MAD (median absolute deviation)\n", | |
| "scaled = (data - np.median(data)) / np.median(np.abs(data - np.median(data)))\n", | |
| "print(scaled)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Sample continuous data (e.g., customer ages)\n", | |
| "ages = np.array([18, 25, 35, 22, 45, 67, 23, 29, 34, 56, 41, 38, 52, 28, 33])\n", | |
| "\n", | |
| "# One-liner: Create 4 equal-frequency (quartile) bins; labels run from -1 to 2 because of the trailing - 1\n", | |
| "binned = np.digitize(ages, np.percentile(ages, [25, 50, 75])) - 1\n", | |
| "print(binned)\n" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "MZ654h7ec34I", | |
| "outputId": "144564d7-b48c-4bf2-c034-b131fbaeeefa" | |
| }, | |
| "execution_count": 2, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[-1 -1 1 -1 2 2 -1 0 1 2 1 1 2 0 0]\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Original features (e.g., temperature, humidity)\n", | |
| "X = np.array([[20, 65], [25, 70], [30, 45], [22, 80]])\n", | |
| "\n", | |
| "# One-liner: Generate degree-2 polynomial features\n", | |
| "poly_features = np.column_stack([X[:, [i, j]].prod(axis=1) for i in range(X.shape[1]) for j in range(i, X.shape[1])])\n", | |
| "print(poly_features)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "kzwwApnQc9pV", | |
| "outputId": "11249482-96eb-47cc-db05-73bf1b620d27" | |
| }, | |
| "execution_count": 3, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[[ 400 1300 4225]\n", | |
| " [ 625 1750 4900]\n", | |
| " [ 900 1350 2025]\n", | |
| " [ 484 1760 6400]]\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Time series data (e.g., daily sales)\n", | |
| "sales = np.array([100, 98, 120, 130, 74, 145, 110, 140, 65, 105, 135])\n", | |
| "\n", | |
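| "# One-liner: Create lag-1, lag-2, and lag-3 features ([3:] drops the first 3 rows, where np.roll wraps around)\n", | |
| "lags = np.column_stack([np.roll(sales, shift) for shift in [1, 2, 3]])[3:]\n", | |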
| "print(lags)\n" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "5XTtCYpQdfML", | |
| "outputId": "9bbe8c29-c1e2-4cbb-be7d-00b77bfe89b7" | |
| }, | |
| "execution_count": 4, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[[120 98 100]\n", | |
| " [130 120 98]\n", | |
| " [ 74 130 120]\n", | |
| " [145 74 130]\n", | |
| " [110 145 74]\n", | |
| " [140 110 145]\n", | |
| " [ 65 140 110]\n", | |
| " [105 65 140]]\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Categorical data (e.g., product categories)\n", | |
| "categories = np.array([0, 1, 2, 1, 0, 2, 3, 1])\n", | |
| "\n", | |
| "# One-liner: One-hot encode\n", | |
| "one_hot = (categories[:, None] == np.arange(categories.max() + 1)).astype(int)\n", | |
| "print(one_hot)\n" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "R0HPVkzmdgvW", | |
| "outputId": "a7a1e5b3-9dac-46c3-90bb-14915619a3dd" | |
| }, | |
| "execution_count": 5, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[[1 0 0 0]\n", | |
| " [0 1 0 0]\n", | |
| " [0 0 1 0]\n", | |
| " [0 1 0 0]\n", | |
| " [1 0 0 0]\n", | |
| " [0 0 1 0]\n", | |
| " [0 0 0 1]\n", | |
| " [0 1 0 0]]\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Coordinate data (e.g., latitude/longitude pairs)\n", | |
| "locations = np.array([[40.7128, -74.0060],\n", | |
| " [34.0522, -118.2437],\n", | |
| " [41.8781, -87.6298],\n", | |
| " [29.7604, -95.3698]])\n", | |
| "reference = np.array([39.7392, -104.9903])\n", | |
| "\n", | |
| "# One-liner: Calculate Euclidean distances from reference point\n", | |
| "distances = np.sqrt(((locations - reference) ** 2).sum(axis=1))\n", | |
| "print(distances)\n" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "w0rszMzidyjs", | |
| "outputId": "8eb2ff56-f6a1-4b4a-b383-b59770133c49" | |
| }, | |
| "execution_count": 6, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[30.99959263 14.42201722 17.4917653 13.86111358]\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Sample features (e.g., price, quality, brand_score)\n", | |
| "features = np.array([[10, 8, 7], [15, 9, 6], [12, 7, 8], [20, 10, 9]])\n", | |
| "\n", | |
| "# One-liner: Create all pairwise interactions\n", | |
| "interactions = np.array([features[:, i] * features[:, j]\n", | |
| " for i in range(features.shape[1])\n", | |
| " for j in range(i+1, features.shape[1])]).T\n", | |
| "print(interactions)\n" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "0o8FYjTZd7ey", | |
| "outputId": "cd8ce3c1-309b-4c75-d0b6-ac7f602d02fb" | |
| }, | |
| "execution_count": 7, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[[ 80 70 56]\n", | |
| " [135 90 54]\n", | |
| " [ 84 96 56]\n", | |
| " [200 180 90]]\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Noisy signal data (e.g., stock prices, sensor readings)\n", | |
| "signal = np.array([10, 27, 12, 18, 11, 19, 20, 26, 12, 19, 25, 31, 28])\n", | |
| "window_size = 4\n", | |
| "\n", | |
| "# One-liner: Create rolling mean features\n", | |
| "rolling_mean = np.convolve(signal, np.ones(window_size)/window_size, mode='valid')\n", | |
| "print(rolling_mean)\n" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "amZgqlEZeAOI", | |
| "outputId": "fa48d13f-f98b-43b9-8dd0-a1a66c9c7557" | |
| }, | |
| "execution_count": 9, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[16.75 17. 15. 17. 19. 19.25 19.25 20.5 21.75 25.75]\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Data with potential outliers (e.g., transaction amounts)\n", | |
| "amounts = np.array([25, 30, 28, 32, 500, 29, 31, 27, 33, 26])\n", | |
| "\n", | |
| "# One-liner: Create outlier indicator features (1 = below the 5th or above the 95th percentile)\n", | |
| "outlier_flags = ((amounts < np.percentile(amounts, 5)) |\n", | |
| " (amounts > np.percentile(amounts, 95))).astype(int)\n", | |
| "print(outlier_flags)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "OiHHeWgreq21", | |
| "outputId": "80db2904-677c-4eee-f6b1-2dedadabf2b5" | |
| }, | |
| "execution_count": 10, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[1 0 0 0 1 0 0 0 0 0]\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Categorical data (e.g., product categories)\n", | |
| "categories = np.array(['Electronics', 'Books', 'Electronics', 'Clothing',\n", | |
| " 'Books', 'Electronics', 'Home', 'Books'])\n", | |
| "\n", | |
| "# One-liner: Frequency encode (replace each category with its occurrence count)\n", | |
| "unique_cats, counts = np.unique(categories, return_counts=True)\n", | |
| "freq_encoded = np.array([counts[np.where(unique_cats == cat)[0][0]] for cat in categories])\n", | |
| "print(freq_encoded)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "Nor6f_t-ezhk", | |
| "outputId": "1d142d36-f381-4f1a-eff8-162b6b2c5bd1" | |
| }, | |
| "execution_count": 11, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "[3 3 3 1 3 3 1 3]\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment