CS130 Lesson 3 loess
## Step 1: Load the Training Data
# Download and inspect the training set using `read.csv()` from the URL provided.
# **Training data link:**
training <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSUROPfTOZfUEpf6Ebby-vta5zWCwt9KK-KAwSvpToGQjQSKdhYsUfoHxYxvbOYxW8_IQxBD9FqWFJg/pub?gid=383144413&single=true&output=csv")
# View the first few rows using head()
head(training)
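# Quick structural check (sketch): str() and summary() are base-R ways to
# confirm the column types and value ranges before modeling.
str(training)
summary(training)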
############################################################
## Step 2: Explore the Training Data
# Visualize the relationship between the predictor `x` and the outcome `y`.
plot(training$x, training$y)
# Customize your plot with labels and a title
plot(training$x, training$y, main = "Scatter plot of training data",
     xlab = "x", ylab = "y")
# What kind of pattern do you observe? Linear? Curved? No pattern?
############################################################
## Step 3: Fit Models to the Training Data
# Fit the following models:
# - A **loess model** with a large span (e.g., span = 1)
# - A **loess model** with a small span (e.g., span = 0.2)
# - A **linear model** using `lm()`
# Hint: Use the `loess()` and `lm()` functions.
# Now fit fit_bigspan, fit_smallspan, and fit_lm
fit_bigspan <- loess(y ~ x, data = training, span = 1)
fit_smallspan <- loess(y ~ x, data = training, span = 0.2)
fit_lm <- lm(y ~ x, data = training)
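# Optional check (sketch): summary() on a loess fit reports the "equivalent
# number of parameters," a rough flexibility measure that grows as the span
# shrinks. Exact values depend on the data.
summary(fit_bigspan)
summary(fit_smallspan)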
## Step 4: Predict on the Training Set
# Use `predict()` to get fitted values from each model on the training data.
predicted_big <- predict(fit_bigspan)
# Get predictions from the other models
predicted_small <- predict(fit_smallspan)
predicted_lm <- predict(fit_lm)
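# Sanity check (sketch): with no newdata argument, predict() returns the
# model's fitted values for the training rows, so this should be TRUE.
all.equal(predicted_lm, fitted(fit_lm))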
############################################################
## Step 5: Visualize Predictions on the Training Set
# Overlay the model predictions on the training scatterplot.
plot(training$x, training$y, main = "Training Data with Model Fits", xlab = "x", ylab = "y")
# Overlay model predictions and add a legend
points(training$x, predicted_big, col = "blue", pch = 16, cex = 3)
points(training$x, predicted_small, col = "red", pch = 16)
points(training$x, predicted_lm, col = "orange", pch = 16)
legend("topright", legend = c("loess span = 1", "loess span = 0.2", "linear"),
       col = c("blue", "red", "orange"), pch = 16)
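# Alternative (sketch): to draw the fits as curves rather than points, sort
# by x first; lines() connects points in the order they are given.
ord <- order(training$x)
plot(training$x, training$y, main = "Training Data with Model Fits", xlab = "x", ylab = "y")
lines(training$x[ord], predicted_big[ord], col = "blue", lwd = 2)
lines(training$x[ord], predicted_small[ord], col = "red", lwd = 2)
lines(training$x[ord], predicted_lm[ord], col = "orange", lwd = 2)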
############################################################
## Step 6: Calculate Training MSE
# Define a function to calculate Mean Squared Error (MSE), and compute the
# training MSE for each model. One function works for every model:
mse <- function(actual, predicted) { mean((actual - predicted)^2) }
# Calculate MSEs for each model on the training set
mse_big_train <- mse(training$y, predicted_big)
mse_small_train <- mse(training$y, predicted_small)
mse_lm_train <- mse(training$y, predicted_lm)
cat("mse_big_train =", mse_big_train, "\n\n")
cat("mse_small_train =", mse_small_train, "\n\n")
cat("mse_lm_train =", mse_lm_train, "\n\n")
############################################################
## Step 7: Load and Clean the Test Data
# Download the test set:
# **Test data link:**
# https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv
# Important: **Before predicting on the test set, remove any rows where the
# test `x` values fall outside the range of the training `x` values.**
# Why? Because loess cannot reliably extrapolate beyond the domain of the
# training data: loess predictions outside that range may be undefined (NA)
# or misleading.
test <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv")
# Filter out test rows where x is outside the training x range
x_min <- min(training$x)
x_max <- max(training$x)
test <- test[test$x >= x_min & test$x <= x_max, ]
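# Demonstration (sketch): one unit beyond the training range, predict() on a
# loess fit returns NA, while the linear model extrapolates anyway.
predict(fit_bigspan, newdata = data.frame(x = x_max + 1))  # NA
predict(fit_lm, newdata = data.frame(x = x_max + 1))       # an extrapolated value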
############################################################
## Step 8: Predict on the Test Set
# Use `predict()` to generate predictions on the filtered TEST SET.
predicted_test_big <- predict(fit_bigspan, newdata = test)
predicted_test_small <- predict(fit_smallspan, newdata = test)
predicted_test_lm <- predict(fit_lm, newdata = test)
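# Sanity check (sketch): thanks to the range filter above, the loess
# predictions should contain no NAs.
sum(is.na(predicted_test_big))  # expect 0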
############################################################
## Step 9: Calculate Test MSE
# Compute the MSE for each model using the predictions and actual `y` values in the test set.
mse_big_test <- mse(test$y, predicted_test_big)
mse_small_test <- mse(test$y, predicted_test_small)
mse_lm_test <- mse(test$y, predicted_test_lm)
cat("mse_big_test =", mse_big_test, "\n\n")
cat("mse_small_test =", mse_small_test, "\n\n")
cat("mse_lm_test =", mse_lm_test, "\n\n")
############################################################
# By the way, here is the test-set data sorted by x, in a copy called "test2".
# (We keep the filtered `test` object rather than re-reading the CSV, so the
# row order still matches the predictions computed above.)
# Here is the order of the x variable in this test data:
in_order <- order(test$x)
print(in_order)
# Rearrange the ROWS in this order and call the data frame "test2"
test2 <- test[in_order, ]
############################################################
## Step 10: Visualize Predictions on the Test Set
# Compare model predictions visually against the test data.
predicted_test_big2 <- predicted_test_big[in_order]
predicted_test_small2 <- predicted_test_small[in_order]
predicted_test_lm2 <- predicted_test_lm[in_order]
plot(test2$x, test2$y, main = "Model Predictions on Test Data", xlab = "x", ylab = "y")
points(test2$x, predicted_test_big2, col = "red", pch = 16)
points(test2$x, predicted_test_small2, col = "darkgreen", pch = 5, cex = 1.2)
points(test2$x, predicted_test_lm2, col = "blue", pch = 16, cex = 2)
legend("topright", legend = c("Loess span = 1", "Loess span = 0.2", "Linear Model"),
       col = c("red", "darkgreen", "blue"), pch = c(16, 5, 16))
############################################################
## Discussion Questions
# 1. Which model performed best on the **training** set? Why?
# 2. Which model performed best on the **test** set? Why?
# 3. What do these results suggest about **overfitting** and **model flexibility**?
# 4. Why might we prefer a simpler or smoother model even if it doesn't minimize training error?
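############################################################
# Empirical follow-up to Question 3 (sketch): sweep over a grid of spans and
# compare training vs. test MSE. Training error typically keeps shrinking as
# flexibility grows (smaller span), while test error falls and then rises
# again once the model starts fitting noise.
for (s in c(0.2, 0.4, 0.6, 0.8, 1.0)) {
  fit_s <- loess(y ~ x, data = training, span = s)
  cat("span =", s,
      " train MSE =", mse(training$y, predict(fit_s)),
      " test MSE =", mse(test$y, predict(fit_s, newdata = test)), "\n")
}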