CS130 Lesson 3 loess
## Step 1: Load the Training Data
#Download and inspect the training set using `read.csv()` from the URL provided:
# **Training data link:**
training <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSUROPfTOZfUEpf6Ebby-vta5zWCwt9KK-KAwSvpToGQjQSKdhYsUfoHxYxvbOYxW8_IQxBD9FqWFJg/pub?gid=383144413&single=true&output=csv")
# View the first few rows using head()
head(training)
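# Optional quick checks on the structure of the data
# (this assumes the file has numeric columns named x and y, as used below)
str(training)
summary(training)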
############################################################
## Step 2: Explore the Training Data
#Visualize the relationship between the predictor `x` and the outcome `y`.
plot(training$x, training$y)
# Customize your plot with labels and a title
plot(training$x, training$y, main = "Scatter plot of training data",
xlab = "x", ylab = "y")
# What kind of pattern do you observe? Linear? Curved? No pattern?
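# One optional way to eyeball the pattern: overlay a lowess smooth on the scatterplot
# (a rough visual guide only, not one of the models fit in Step 3)
lines(lowess(training$x, training$y), col = "gray", lwd = 2)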
############################################################
## Step 3: Fit Models to the Training Data
#Fit the following models:
#- A **loess model** with a large span (e.g., span = 1)
#- A **loess model** with a small span (e.g., span = 0.2)
#- A **linear model** using `lm()`
#Hint: Use `loess()` and `lm()` functions.
# Now fit fit_smallspan and fit_lm
fit_bigspan <- loess(y ~ x, data = training, span = 1)
fit_smallspan <- loess(y ~ x, data = training, span = 0.2)
fit_lm <- lm(y ~ x, data = training)
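# Optional: inspect the fitted models. In loess(), span is (roughly) the fraction of the
# data used in each local fit, so a larger span gives a smoother curve.
summary(fit_lm)
summary(fit_bigspan)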
############################################################
## Step 4: Predict on the Training Set
#Use `predict()` to get fitted values from each model on the training data.
predicted_big <- predict(fit_bigspan)
# Get predictions from other models
predicted_small <- predict(fit_smallspan)
predicted_lm <- predict(fit_lm)
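# Optional sanity check: with no newdata argument, predict() returns one fitted value
# per training row, so each prediction vector should have nrow(training) elements
length(predicted_big) == nrow(training)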
############################################################
## Step 5: Visualize Predictions on Training Set
#Overlay the model predictions on the training scatterplot.
plot(training$x, training$y, main = "Training Data with Model Fits", xlab = "x", ylab = "y")
# Overlay model predictions and add a legend
points(training$x, predicted_big, col = "blue", pch = 16, cex = 3)
points(training$x, predicted_small, col = "red", pch = 16)
points(training$x, predicted_lm, col = "orange", pch = 16)
legend("topright", legend = c("losses span = 1", "losses span = 0.2", "linear"), col = c("blue", "red", "orange"), pch = 16)
############################################################
## Step 6: Calculate Training MSE
#Define a function to calculate Mean Squared Error (MSE), and compute training MSE for each model.
mse <- function(actual, predicted) { mean((actual - predicted)^2) }
# Calculate MSEs for each model on the training set
mse_big_train <- mse(training$y, predicted_big)
mse_small_train <- mse(training$y, predicted_small)
mse_lm_train <- mse(training$y, predicted_lm)
cat("Training MSE (loess span = 1) =", mse_big_train, "\n\n")
cat("Training MSE (loess span = 0.2) =", mse_small_train, "\n\n")
cat("Training MSE (linear model) =", mse_lm_train, "\n\n")
############################################################
## Step 7: Load and Clean the Test Data
#Download the test set:
#**Test data link:**
# https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv
#Important: **Before predicting on the test set, remove any rows where the test `x` values are outside the range of the training `x` values.**
#Why? Because loess models cannot reliably extrapolate beyond the domain of the training data. Loess predictions outside that range may be undefined or misleading.
test <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv")
# Filter out test rows where x is outside the training x range
x_min <- min(training$x)
x_max <- max(training$x)
test <- test[test$x >= x_min & test$x <= x_max, ]
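# To see why the filter matters: loess cannot extrapolate, so predicting at an x value
# beyond the training range returns NA (illustrative example with a made-up x value)
predict(fit_bigspan, newdata = data.frame(x = x_max + 1))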
############################################################
## Step 8: Predict on the Test Set
#Use `predict()` to generate predictions on the filtered TEST SET.
predicted_test_big <- predict(fit_bigspan, newdata = test)
predicted_test_small <- predict(fit_smallspan, newdata = test)
predicted_test_lm <- predict(fit_lm, newdata = test)
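# Optional check: after the range filter, none of the loess predictions should be NA
stopifnot(!anyNA(predicted_test_big), !anyNA(predicted_test_small))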
############################################################
## Step 9: Calculate Test MSE
# Compute the MSE for each model using the predictions and actual `y` values in the test set.
mse_big_test <- mse(test$y, predicted_test_big)
mse_small_test <- mse(test$y, predicted_test_small)
mse_lm_test <- mse(test$y, predicted_test_lm)
cat("Test MSE (loess span = 1) =", mse_big_test, "\n\n")
cat("Test MSE (loess span = 0.2) =", mse_small_test, "\n\n")
cat("Test MSE (linear model) =", mse_lm_test, "\n\n")
############################################################
# To plot the fits cleanly, sort the (filtered) test set by x.
# Note: do NOT re-read the test data from the URL here; that would undo the range filter
# and the rows would no longer line up with the predictions made above.
# here is the order of the x variable in the filtered test data
in_order <- order(test$x)
print(in_order)
# rearrange the ROWS in this order and call the data frame "test2"
test2 <- test[in_order, ]
############################################################
## Step 10: Visualize Predictions on Test Set
#Compare model predictions visually against the test data.
predicted_test_big2 <- predicted_test_big[in_order]
predicted_test_small2 <- predicted_test_small[in_order]
predicted_test_lm2 <- predicted_test_lm[in_order]
plot(test2$x, test2$y, main = "Model Predictions on Test Data", xlab = "x", ylab = "y")
points(test2$x, predicted_test_big2, col = "red", pch = 16)
points(test2$x, predicted_test_small2, col = "darkgreen", pch = 5, cex = 1.2)
points(test2$x, predicted_test_lm2, col = "blue", pch = 16, cex = 2)
legend("topright", legend = c("Loess span=1", "Loess span=0.2", "Linear Model"),
col = c("red", "green", "blue"), pch = 16)
############################################################
## Discussion Questions
#1. Which model performed best on the **training** set? Why?
#2. Which model performed best on the **test** set? Why?
#3. What do these results suggest about **overfitting** and **model flexibility**?
#4. Why might we prefer a simpler or smoother model even if it doesn’t minimize training error?