Skip to content

Instantly share code, notes, and snippets.

@abikoushi
Created February 25, 2026 13:16
Show Gist options
  • Select an option

  • Save abikoushi/a4bd644ef4c8730175de4d7c07526c87 to your computer and use it in GitHub Desktop.

Select an option

Save abikoushi/a4bd644ef4c8730175de4d7c07526c87 to your computer and use it in GitHub Desktop.
Skip Gram With Negative Sampling using R
#####
#Reference
#Omer Levy, Yoav Goldberg (NIPS 2014)
#Neural Word Embedding as Implicit Matrix Factorization
#https://papers.nips.cc/paper_files/paper/2014/hash/b78666971ceae55a8e87efb7cbfd9ad4-Abstract.html
#####
library(torch)
library(dplyr)
#install_torch(reinstall = TRUE)
# --- Simulate co-occurrence counts from a known low-rank PMI matrix ----
# NOTE(review): no set.seed() call, so every run draws a different dataset.
vocab_size <- 100
emb_dim <- 2
# True input (V) and output (U) embeddings; their product is the true PMI.
V_true <- matrix(rnorm(vocab_size*emb_dim), vocab_size, emb_dim)
U_true <- matrix(rnorm(vocab_size*emb_dim), vocab_size, emb_dim)
PMI_true <- V_true %*% t(U_true)
# Exponentiate and normalize into a joint probability over (center, context).
P <- exp(PMI_true)
P <- P / sum(P)
# Draw N_total co-occurrence events across all vocab_size^2 cells.
N_total <- 200000
counts <- rmultinom(1, size = N_total, prob = as.vector(P))
Freq <- matrix(counts, vocab_size, vocab_size)
# Long format: Var1 = center word id, Var2 = context word id, value = count.
df <- reshape2::melt(Freq)
# Only observed (positive-count) pairs are used as training examples.
df_pos <- dplyr::filter(df, value > 0)
head(df_pos)
# Training data as torch tensors; embedding lookups require long (integer) ids.
center <- torch_tensor(df_pos$Var1, dtype=torch_long())
context <- torch_tensor(df_pos$Var2, dtype=torch_long())
# Per-pair weight = observed co-occurrence count for that (center, context).
weight <- torch_tensor(df_pos$value, dtype=torch_float())
# Marginal frequency of each center word (0 for words never observed), then
# the word2vec-style unigram^0.75 negative-sampling distribution.
# Changes vs. original:
#  * removed the dead `freq_word <- numeric(vocab_size)` (immediately
#    overwritten by the pipeline below);
#  * added arrange(Var1): dplyr's right_join appends unmatched rows from the
#    right-hand table at the END, so without sorting, any word with zero
#    count would land at the wrong position in freq_word;
#  * ifelse(is.na(f), 0, f) -> coalesce(f, 0), the idiomatic NA-fill.
freq_word <- df_pos %>%
  dplyr::group_by(Var1) %>%
  dplyr::summarise(f = sum(value)) %>%
  dplyr::right_join(
    data.frame(Var1 = 1:vocab_size),
    by = "Var1"
  ) %>%
  dplyr::arrange(Var1) %>%
  dplyr::mutate(f = dplyr::coalesce(f, 0)) %>%
  dplyr::pull(f)
prob_neg2 <- freq_word^0.75
prob_neg2 <- prob_neg2 / sum(prob_neg2)
# Empirical distribution of zero-count (unobserved) cells per center word,
# used as the negative-sampling distribution in the training loop below.
# Changes vs. original:
#  * removed the dead `prob_neg <- prob_neg2` (immediately overwritten);
#  * completed the index over 1:vocab_size so length(prob_neg) always equals
#    vocab_size — in the original, a word with no zero-count cells dropped
#    out entirely, which would misalign/shorten prob_neg and break both
#    sample.int(prob = prob_neg) and the comparison plot;
#  * arrange(Var1) guards against right_join's row-ordering (unmatched
#    right-hand rows are appended at the end).
prob_neg <- dplyr::filter(df, value == 0) %>%
  group_by(Var1) %>%
  tally() %>%
  dplyr::right_join(data.frame(Var1 = 1:vocab_size), by = "Var1") %>%
  dplyr::arrange(Var1) %>%
  mutate(n = dplyr::coalesce(n, 0L), prob = n / sum(n)) %>%
  dplyr::pull(prob)
# Visual check: zero-cell distribution vs the unigram^0.75 distribution.
plot(prob_neg, prob_neg2)
# Skip-gram with negative sampling (SGNS), following Levy & Goldberg (2014).
# forward() returns the (optionally count-weighted) mean negative
# log-likelihood over the batch.
skipgram_ns <- nn_module(
  initialize = function(vocab_size, emb_dim) {
    # Separate input ("center") and output ("context") embedding tables.
    self$in_embed <- nn_embedding(vocab_size, emb_dim)
    self$out_embed <- nn_embedding(vocab_size, emb_dim)
  },
  forward = function(center, pos_context, neg_context, weight = NULL) {
    # Lookups: (B, D) for center/positive ids, (B, K, D) for negatives.
    emb_c <- self$in_embed(center)
    emb_pos <- self$out_embed(pos_context)
    emb_neg <- self$out_embed(neg_context)
    # Positive term: log sigmoid(v . u_pos), one score per pair -> (B).
    score_pos <- (emb_c * emb_pos)$sum(dim = 2)
    loglik_pos <- nnf_logsigmoid(score_pos)
    # Negative term: batched (B,K,D) x (B,D,1) -> (B,K) scores,
    # log sigmoid(-score) summed over the K negatives -> (B).
    score_neg <- torch_matmul(emb_neg, emb_c$unsqueeze(3))$squeeze(3)
    loglik_neg <- nnf_logsigmoid(-score_neg)$sum(dim = 2)
    per_pair <- -(loglik_pos + loglik_neg)
    # Plain mean, or weighted mean when per-pair weights are supplied.
    if (is.null(weight)) {
      per_pair$mean()
    } else {
      (per_pair * weight)$sum() / weight$sum()
    }
  }
)
# --- Train with minibatch gradient descent over the positive pairs ----
# K negative samples per positive pair.
K <- 20
batch_size <- 64
# Number of positive (center, context) training pairs.
n <- length(center)
model <- skipgram_ns(vocab_size, emb_dim)
optimizer <- optim_adam(model$parameters, lr=1e-3)
n_epoch <- 200
loss_history <-numeric(n_epoch)
for (epoch in seq_len(n_epoch)) {
# Shuffle the pair order each epoch.
perm <- sample.int(n)
for (i in seq(1, n, by=batch_size)) {
# Last batch may be shorter than batch_size; min() clamps the range.
idx <- perm[i:min(i+batch_size-1, n)]
c_batch <- center[idx]
y_batch <- context[idx]
w_batch <- weight[idx]
# negative sampling: draw K negative context ids per pair from prob_neg
# (the empirical zero-cell distribution computed above).
neg_mat <- matrix(
sample.int(vocab_size,
size = length(idx)*K,
replace = TRUE,
prob = prob_neg),
nrow = length(idx),
ncol = K
)
neg_batch <- torch_tensor(neg_mat, dtype=torch_long())
optimizer$zero_grad()
loss <- model(c_batch, y_batch, neg_batch, weight = w_batch)
# Accumulate each batch's (weighted-mean) loss into the epoch total,
# purely for monitoring.
loss_history[epoch] <-loss_history[epoch] + loss$item()
loss$backward()
optimizer$step()
}
}
# Per-epoch training-loss curve.
plot(loss_history, type="l")
# Learned embedding tables as plain R matrices.
Vhat <- as.matrix(model$in_embed$weight$detach()$cpu())
Uhat <- as.matrix(model$out_embed$weight$detach()$cpu())
# Levy & Goldberg (2014): SGNS implicitly factorizes PMI - log(K), so adding
# log(K) back should recover the true PMI; points should hug the y = x line.
plot(PMI_true, Vhat%*%t(Uhat)+log(K), col=rgb(0,0,0,0.5))
abline(0,1, lty=2, col="grey")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment