Skip to content

Instantly share code, notes, and snippets.

@abikoushi
Created January 8, 2026 08:18
Show Gist options
  • Select an option

  • Save abikoushi/d986d60d4e1fdf13f02abb81fb405bbf to your computer and use it in GitHub Desktop.

Select an option

Save abikoushi/d986d60d4e1fdf13f02abb81fb405bbf to your computer and use it in GitHub Desktop.
冗長性のあるワンホットエンコーディングによるパラメータ化の係数をフルランクの計画行列の係数に変換する(切片項なし)
library(moltenNMF)
library(Matrix)
df = expand.grid(letters[1:3], LETTERS[1:2])
X = sparse_onehot(~., data = df)
A = matrix(c(1,0,0,0,
0,1,0,0,
0,0,1,0,
0,0,0,0,
0,0,0,1), byrow = TRUE, nrow = 5)
print(A)
Xs = sparse.model.matrix(~.-1, data = df)
print(all(X %*% A == Xs))
B = matrix(c(1,0,0,1,0,
0,1,0,1,0,
0,0,1,1,0,
0,0,0,-1,1), byrow = TRUE, nrow = 4)
print(B)
print(all(Xs %*% B == X))
clen = c(4, 2, 2)
full_to_onehot <- function(clen){
n_full = sum(clen - 1) + 1
n_onehot = sum(clen)
B = matrix(0, n_full, n_onehot)
B[1:clen[1],1:clen[1]] <- diag(clen[1])
cumlen <- cumsum(clen)
cumlen_shift <- cumsum(c(clen[1],clen[-1]-1) )
for(k in seq_len(length(cumlen)-1)){
B[1:cumlen_shift[1],cumlen[k]+1] <- 1
for(i in (cumlen[k]+1):cumlen[k+1]){
B[cumlen_shift[k]+1,i] <- 1
}
B[cumlen_shift[k]+1,cumlen[k]+1] <- -1
}
return(B)
}
df = expand.grid(x1=letters[1:4], x2=LETTERS[1:2], x3=factor(1:2))
Xs = sparse.model.matrix(~x1+x2+x3-1, data = df)
A = full_to_onehot(c(4, 2, 2))
###
set_attr_modelmat <- function(X) {
if (!is.null(attr(X, "assign"))) {
attr(X, "indices") <- c(0L, cumsum(rle(attr(X, "assign"))$lengths))
}
labs <- names(attr(X, "contrasts"))
if (!is.null(labs)) {
attr(X, "term.labels") <- labs
if (!is.null(X@Dimnames[[2]])) {
attr(X, "value.labels") <- sub(
paste(labs, collapse = "|"),
"",
X@Dimnames[[2]]
)
}
}
return(X)
}
df = as.data.frame(Titanic)
X_s = sparse_onehot(Freq ~ Class + Sex + Age + Survived, data = df) #冗長なワンホット
L = 2
system.time({
out_s <- mNMF_vb.default(df$Freq, X = X_s, L = L, iter = 1000)
})
X = sparse.model.matrix(Freq ~ Class + Sex + Age + Survived-1, data = df) #フルランク
X = set_attr_modelmat(X)
system.time({
out <- mNMF_vb.default(df$Freq, X = X, L = L, iter = 1000)
})
plot(out_s$ELBO[-1], type = "l", lty = 2)
lines(out$ELBO[-1], type = "l")
V = (out$shape / out$rate)
V_s = (out_s$shape / out_s$rate)
fit = product_m.default(X, V)
fit_s = product_m.default(X_s, V_s)
abline(0, 1, lty = 2)
A = full_to_onehot(c(4, 2, 2, 2))
dim(A)
dim(V)
all(X %*% A == X_s)
V2 <- exp(A %*% log(V_s))
fit2 = product_m.default(X, V2)
plot(df$Freq, fit)
points(df$Freq, fit_s, col = "royalblue", pch = 2)
points(df$Freq, fit2, col = "orangered", pch = 3)
V = V[, order(colMeans(V), decreasing = TRUE)]
V2 = V2[, order(colMeans(V2), decreasing = TRUE)]
plot(V, V2)
abline(0, 1, lty = 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment