CS130 Lesson 3 loess
## Step 1: Load the Training Data
#Download and inspect the training set using `read.csv()` from the URL provided:
# **Training data link:**
training <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSUROPfTOZfUEpf6Ebby-vta5zWCwt9KK-KAwSvpToGQjQSKdhYsUfoHxYxvbOYxW8_IQxBD9FqWFJg/pub?gid=383144413&single=true&output=csv")
# View the first few rows using head()
head(training)
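# Optional quick checks on the structure of the data
# (this assumes the file has numeric columns named x and y, as used below)
str(training)
summary(training)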
############################################################
## Step 2: Explore the Training Data
#Visualize the relationship between the predictor `x` and the outcome `y`.
plot(training$x, training$y)
# Customize your plot with labels and a title
plot(training$x, training$y, main = "Scatter plot of training data",
xlab = "x", ylab = "y")
# What kind of pattern do you observe? Linear? Curved? No pattern?
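# One optional way to eyeball the pattern: overlay a lowess smooth on the scatterplot
# (a rough visual guide only, not one of the models fit in Step 3)
lines(lowess(training$x, training$y), col = "gray", lwd = 2)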
############################################################
## Step 3: Fit Models to the Training Data
#Fit the following models:
#- A **loess model** with a large span (e.g., span = 1)
#- A **loess model** with a small span (e.g., span = 0.2)
#- A **linear model** using `lm()`
#Hint: Use `loess()` and `lm()` functions.
# Now fit fit_smallspan and fit_lm
fit_bigspan <- loess(y ~ x, data = training, span = 1)
fit_smallspan <- loess(y ~ x, data = training, span = 0.2)
fit_lm <- lm(y ~ x, data = training)
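# Optional: inspect the fitted models. In loess(), span is (roughly) the fraction of the
# data used in each local fit, so a larger span gives a smoother curve.
summary(fit_lm)
summary(fit_bigspan)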
############################################################
## Step 4: Predict on the Training Set
#Use `predict()` to get fitted values from each model on the training data.
predicted_big <- predict(fit_bigspan)
# Get predictions from other models
predicted_small <- predict(fit_smallspan)
predicted_lm <- predict(fit_lm)
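# Optional sanity check: with no newdata argument, predict() returns one fitted value
# per training row, so each prediction vector should have nrow(training) elements
length(predicted_big) == nrow(training)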
############################################################
## Step 5: Visualize Predictions on Training Set
#Overlay the model predictions on the training scatterplot.
plot(training$x, training$y, main = "Training Data with Model Fits", xlab = "x", ylab = "y")
# Overlay model predictions and add a legend
points(training$x, predicted_big, col = "blue", pch = 16, cex = 3)
points(training$x, predicted_small, col = "red", pch = 16)
points(training$x, predicted_lm, col = "orange", pch = 16)
legend("topright", legend = c("losses span = 1", "losses span = 0.2", "linear"), col = c("blue", "red", "orange"), pch = 16)
############################################################
## Step 6: Calculate Training MSE
#Define a function to calculate Mean Squared Error (MSE), and compute training MSE for each model.
mse <- function(actual, predicted) { mean((actual - predicted)^2) }
# Calculate MSEs for each model on the training set
mse_big_train <- mse(training$y, predicted_big)
mse_small_train <- mse(training$y, predicted_small)
mse_lm_train <- mse(training$y, predicted_lm)
cat("Training MSE (loess span = 1) =", mse_big_train, "\n\n")
cat("Training MSE (loess span = 0.2) =", mse_small_train, "\n\n")
cat("Training MSE (linear model) =", mse_lm_train, "\n\n")
############################################################
## Step 7: Load and Clean the Test Data
#Download the test set:
#**Test data link:**
# https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv
#Important: **Before predicting on the test set, remove any rows where the test `x` values are outside the range of the training `x` values.**
#Why? Because loess models cannot reliably extrapolate beyond the domain of the training data. Loess predictions outside that range may be undefined or misleading.
test <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv")
# Filter out test rows where x is outside the training x range
x_min <- min(training$x)
x_max <- max(training$x)
test <- test[test$x >= x_min & test$x <= x_max, ]
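# To see why the filter matters: loess cannot extrapolate, so predicting at an x value
# beyond the training range returns NA (illustrative example with a made-up x value)
predict(fit_bigspan, newdata = data.frame(x = x_max + 1))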
############################################################
## Step 8: Predict on the Test Set
#Use `predict()` to generate predictions on the filtered TEST SET.
predicted_test_big <- predict(fit_bigspan, newdata = test)
predicted_test_small <- predict(fit_smallspan, newdata = test)
predicted_test_lm <- predict(fit_lm, newdata = test)
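# Optional check: after the range filter, none of the loess predictions should be NA
stopifnot(!anyNA(predicted_test_big), !anyNA(predicted_test_small))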
############################################################
## Step 9: Calculate Test MSE
# Compute the MSE for each model using the predictions and actual `y` values in the test set.
mse_big_test <- mse(test$y, predicted_test_big)
mse_small_test <- mse(test$y, predicted_test_small)
mse_lm_test <- mse(test$y, predicted_test_lm)
cat("Test MSE (loess span = 1) =", mse_big_test, "\n\n")
cat("Test MSE (loess span = 0.2) =", mse_small_test, "\n\n")
cat("Test MSE (linear model) =", mse_lm_test, "\n\n")
############################################################
# To plot the fits cleanly, sort the (filtered) test set by x.
# Note: do NOT re-read the test data from the URL here; that would undo the range filter
# and the rows would no longer line up with the predictions made above.
# here is the order of the x variable in the filtered test data
in_order <- order(test$x)
print(in_order)
# rearrange the ROWS in this order and call the data frame "test2"
test2 <- test[in_order, ]
############################################################
## Step 10: Visualize Predictions on Test Set
#Compare model predictions visually against the test data.
predicted_test_big2 <- predicted_test_big[in_order]
predicted_test_small2 <- predicted_test_small[in_order]
predicted_test_lm2 <- predicted_test_lm[in_order]
plot(test2$x, test2$y, main = "Model Predictions on Test Data", xlab = "x", ylab = "y")
points(test2$x, predicted_test_big2, col = "red", pch = 16)
points(test2$x, predicted_test_small2, col = "darkgreen", pch = 5, cex = 1.2)
points(test2$x, predicted_test_lm2, col = "blue", pch = 16, cex = 2)
legend("topright", legend = c("Loess span=1", "Loess span=0.2", "Linear Model"),
col = c("red", "green", "blue"), pch = 16)
############################################################
## Discussion Questions
#1. Which model performed best on the **training** set? Why?
#2. Which model performed best on the **test** set? Why?
#3. What do these results suggest about **overfitting** and **model flexibility**?
#4. Why might we prefer a simpler or smoother model even if it doesn’t minimize training error?