Skip to content

Instantly share code, notes, and snippets.

@abikoushi
Created February 25, 2026 13:16
Show Gist options
  • Select an option

  • Save abikoushi/a4bd644ef4c8730175de4d7c07526c87 to your computer and use it in GitHub Desktop.

Select an option

Save abikoushi/a4bd644ef4c8730175de4d7c07526c87 to your computer and use it in GitHub Desktop.
Skip Gram With Negative Sampling using R
#####
#Reference
#Omer Levy, Yoav Goldberg (NIPS 2014)
#Neural Word Embedding as Implicit Matrix Factorization
#https://papers.nips.cc/paper_files/paper/2014/hash/b78666971ceae55a8e87efb7cbfd9ad4-Abstract.html
#####
library(torch)
library(dplyr)
#install_torch(reinstall = TRUE)
# --- Simulate co-occurrence counts from a known low-rank PMI matrix ----
# NOTE(review): no set.seed() call, so every run draws a different dataset.
vocab_size <- 100
emb_dim <- 2
# True input (V) and output (U) embeddings; their product is the true PMI.
V_true <- matrix(rnorm(vocab_size*emb_dim), vocab_size, emb_dim)
U_true <- matrix(rnorm(vocab_size*emb_dim), vocab_size, emb_dim)
PMI_true <- V_true %*% t(U_true)
# Exponentiate and normalize into a joint probability over (center, context).
P <- exp(PMI_true)
P <- P / sum(P)
# Draw N_total co-occurrence events across all vocab_size^2 cells.
N_total <- 200000
counts <- rmultinom(1, size = N_total, prob = as.vector(P))
Freq <- matrix(counts, vocab_size, vocab_size)
# Long format: Var1 = center word id, Var2 = context word id, value = count.
df <- reshape2::melt(Freq)
# Only observed (positive-count) pairs are used as training examples.
df_pos <- dplyr::filter(df, value > 0)
head(df_pos)
# Training data as torch tensors; embedding lookups require long (integer) ids.
center <- torch_tensor(df_pos$Var1, dtype=torch_long())
context <- torch_tensor(df_pos$Var2, dtype=torch_long())
# Per-pair weight = observed co-occurrence count for that (center, context).
weight <- torch_tensor(df_pos$value, dtype=torch_float())
# Marginal frequency of each center word (0 for words never observed), then
# the word2vec-style unigram^0.75 negative-sampling distribution.
# Changes vs. original:
#  * removed the dead `freq_word <- numeric(vocab_size)` (immediately
#    overwritten by the pipeline below);
#  * added arrange(Var1): dplyr's right_join appends unmatched rows from the
#    right-hand table at the END, so without sorting, any word with zero
#    count would land at the wrong position in freq_word;
#  * ifelse(is.na(f), 0, f) -> coalesce(f, 0), the idiomatic NA-fill.
freq_word <- df_pos %>%
  dplyr::group_by(Var1) %>%
  dplyr::summarise(f = sum(value)) %>%
  dplyr::right_join(
    data.frame(Var1 = 1:vocab_size),
    by = "Var1"
  ) %>%
  dplyr::arrange(Var1) %>%
  dplyr::mutate(f = dplyr::coalesce(f, 0)) %>%
  dplyr::pull(f)
prob_neg2 <- freq_word^0.75
prob_neg2 <- prob_neg2 / sum(prob_neg2)
# Empirical distribution of zero-count (unobserved) cells per center word,
# used as the negative-sampling distribution in the training loop below.
# Changes vs. original:
#  * removed the dead `prob_neg <- prob_neg2` (immediately overwritten);
#  * completed the index over 1:vocab_size so length(prob_neg) always equals
#    vocab_size — in the original, a word with no zero-count cells dropped
#    out entirely, which would misalign/shorten prob_neg and break both
#    sample.int(prob = prob_neg) and the comparison plot;
#  * arrange(Var1) guards against right_join's row-ordering (unmatched
#    right-hand rows are appended at the end).
prob_neg <- dplyr::filter(df, value == 0) %>%
  group_by(Var1) %>%
  tally() %>%
  dplyr::right_join(data.frame(Var1 = 1:vocab_size), by = "Var1") %>%
  dplyr::arrange(Var1) %>%
  mutate(n = dplyr::coalesce(n, 0L), prob = n / sum(n)) %>%
  dplyr::pull(prob)
# Visual check: zero-cell distribution vs the unigram^0.75 distribution.
plot(prob_neg, prob_neg2)
# Skip-gram with negative sampling (SGNS), following Levy & Goldberg (2014).
# forward() returns the (optionally count-weighted) mean negative
# log-likelihood over the batch.
skipgram_ns <- nn_module(
  initialize = function(vocab_size, emb_dim) {
    # Separate input ("center") and output ("context") embedding tables.
    self$in_embed <- nn_embedding(vocab_size, emb_dim)
    self$out_embed <- nn_embedding(vocab_size, emb_dim)
  },
  forward = function(center, pos_context, neg_context, weight = NULL) {
    # Lookups: (B, D) for center/positive ids, (B, K, D) for negatives.
    emb_c <- self$in_embed(center)
    emb_pos <- self$out_embed(pos_context)
    emb_neg <- self$out_embed(neg_context)
    # Positive term: log sigmoid(v . u_pos), one score per pair -> (B).
    score_pos <- (emb_c * emb_pos)$sum(dim = 2)
    loglik_pos <- nnf_logsigmoid(score_pos)
    # Negative term: batched (B,K,D) x (B,D,1) -> (B,K) scores,
    # log sigmoid(-score) summed over the K negatives -> (B).
    score_neg <- torch_matmul(emb_neg, emb_c$unsqueeze(3))$squeeze(3)
    loglik_neg <- nnf_logsigmoid(-score_neg)$sum(dim = 2)
    per_pair <- -(loglik_pos + loglik_neg)
    # Plain mean, or weighted mean when per-pair weights are supplied.
    if (is.null(weight)) {
      per_pair$mean()
    } else {
      (per_pair * weight)$sum() / weight$sum()
    }
  }
)
# --- Train with minibatch gradient descent over the positive pairs ----
# K negative samples per positive pair.
K <- 20
batch_size <- 64
# Number of positive (center, context) training pairs.
n <- length(center)
model <- skipgram_ns(vocab_size, emb_dim)
optimizer <- optim_adam(model$parameters, lr=1e-3)
n_epoch <- 200
loss_history <-numeric(n_epoch)
for (epoch in seq_len(n_epoch)) {
# Shuffle the pair order each epoch.
perm <- sample.int(n)
for (i in seq(1, n, by=batch_size)) {
# Last batch may be shorter than batch_size; min() clamps the range.
idx <- perm[i:min(i+batch_size-1, n)]
c_batch <- center[idx]
y_batch <- context[idx]
w_batch <- weight[idx]
# negative sampling: draw K negative context ids per pair from prob_neg
# (the empirical zero-cell distribution computed above).
neg_mat <- matrix(
sample.int(vocab_size,
size = length(idx)*K,
replace = TRUE,
prob = prob_neg),
nrow = length(idx),
ncol = K
)
neg_batch <- torch_tensor(neg_mat, dtype=torch_long())
optimizer$zero_grad()
loss <- model(c_batch, y_batch, neg_batch, weight = w_batch)
# Accumulate each batch's (weighted-mean) loss into the epoch total,
# purely for monitoring.
loss_history[epoch] <-loss_history[epoch] + loss$item()
loss$backward()
optimizer$step()
}
}
# Per-epoch training-loss curve.
plot(loss_history, type="l")
# Learned embedding tables as plain R matrices.
Vhat <- as.matrix(model$in_embed$weight$detach()$cpu())
Uhat <- as.matrix(model$out_embed$weight$detach()$cpu())
# Levy & Goldberg (2014): SGNS implicitly factorizes PMI - log(K), so adding
# log(K) back should recover the true PMI; points should hug the y = x line.
plot(PMI_true, Vhat%*%t(Uhat)+log(K), col=rgb(0,0,0,0.5))
abline(0,1, lty=2, col="grey")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment