@y-mitsui
Last active June 4, 2018 17:15
# -*- coding: utf-8 -*-
"""
Reference:
トピックモデルによる統計的潜在意味解析 (Statistical Latent Semantic Analysis via Topic Models), Chapter 4
"""
from __future__ import print_function
from scipy.special import digamma
import numpy as np
import time
from sklearn.metrics import accuracy_score
class MSldaLogit:
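    """Supervised LDA variant with a group-level logistic response, fit by
    variational inference.

    Documents sharing a group_no are pooled, and a logistic regression on the
    pooled topic assignments predicts each group's binary label. Labels set
    to -1 are treated as missing and contribute only to the LDA part.
    """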
    def __init__(self, n_topic, n_iter, alpha=0.5, beta=0.5):
        self.n_topic = n_topic
        self.n_iter = n_iter
        self.alpha = alpha
        self.beta = beta
    def lhood(self, theta, phi, corpus):
        """Log-likelihood of the corpus under the normalized topic mixtures."""
        phi_hat = np.zeros(phi.shape)
        theta_hat = np.zeros(theta.shape)
        n_documents = len(corpus)
        for k in range(self.n_topic):
            phi_hat[k] = phi[k] / np.sum(phi[k])
        for d in range(n_documents):
            theta_hat[d] = theta[d] / np.sum(theta[d])
        ret = 0.
        for d, row_corpus in enumerate(corpus):
            for w_i, w_c in row_corpus:
                prob = np.dot(theta_hat[d], phi_hat[:, w_i])
                ret += np.log(prob) * w_c
        return ret
    def lhoodLogit(self, reg_weights, latent_z, sample_y, slack, group_no):
        """Log-likelihood of the logistic part, evaluated on labeled groups."""
        sample_X = []
        for l in range(self.n_group):
            if sample_y[l] != -1:
                sample_X.append(self._getGroupFeature(latent_z, group_no, l))
        est_y = np.dot(sample_X, reg_weights.reshape(-1, 1)).reshape((-1,))
        probs = sample_y[sample_y != -1] * est_y - np.log(1 + np.exp(est_y))
        return probs.sum()
    def _getGroupFeature(self, latent_z, group_no, group_idx):
        r = []
        for doc_z in np.array(latent_z)[group_no == group_idx]:
            r.append(self._getDocumentFeature(doc_z))
        return np.sum(r, 0)
    def _getDocumentFeature(self, latent_zd):
        # Hard-assign each word to its most probable topic and return the
        # document's empirical topic histogram.
        z = np.argmax(latent_zd, 1)
        dummies = []
        for each_z in z:
            val = [0] * self.n_topic
            val[each_z] = 1
            dummies.append(val)
        return np.average(dummies, 0)
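    # Gradient of the variational lower bound with respect to the logistic
    # regression weights (eta), summed over labeled groups.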
    def gradEta(self, reg_weights, latent_z, sample_y, slack):
        delta = np.zeros(len(reg_weights))
        for l in range(self.n_group):
            if sample_y[l] == -1:
                continue
            z_reg_w_tot = 1.
            for d in range(self.n_documents):
                cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                z_reg_w_tot *= np.prod(np.dot(latent_z[d], exp_tmp).reshape(-1,))
            tmp = np.sum([self.a[l, d] * np.average(z_d, 0) for d, z_d in enumerate(latent_z)], 0)
            temp1 = sample_y[l] * tmp
            temp2 = 0.
            for d in range(self.n_documents):
                if self.a[l, d] == 1.:
                    cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                    exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                    z_reg_w = z_reg_w_tot / np.dot(latent_z[d], exp_tmp)
                    temp3s = self.a[l, d] * latent_z[d] / self.n_word_in_docs[d]
                    temp2 += np.sum(temp3s * exp_tmp.T * z_reg_w, 0)
            delta += temp1 - temp2 / slack[l]
        return delta
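    # Objective monitored during the inner gradient-ascent loop (the eta- and
    # slack-dependent part of the variational bound).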
    def lossF(self, reg_weights, latent_z, sample_y, slack):
        loss = 0.
        for l in range(self.n_group):
            tmp = np.dot(latent_z[l], np.exp(reg_weights / self.n_word_in_docs[l]).reshape(-1, 1))
            tmp2 = np.prod(tmp) / slack[l] - (1. / slack[l]) - np.log(slack[l]) + 1.
            loss += sample_y[l] * np.dot(np.average(latent_z[l], 0), reg_weights) - tmp2
        return loss
    def fit(self, corpus, sample_y, group_no):
        self.fit_transform(corpus, sample_y, group_no)
    def fit_transform(self, corpus, sample_y, group_no):
        """
        corpus: corpus in gensim format
        sample_y: binary (0 or 1) target variable per group; set -1 to mark a
            missing label
        group_no: group index of each document
        """
        group_no = np.array(group_no)
        sample_y = np.array(sample_y)
        self.n_documents = D = len(corpus)
        self.n_group = sample_y.shape[0]
        K = self.n_topic
        # vocabulary size = largest word id seen in the corpus + 1
        max_index = 0
        for row_corpus in corpus:
            for w_i, w_c in row_corpus:
                if max_index < w_i:
                    max_index = w_i
        V = max_index + 1
        print("====== start training ======")
        print("number of documents %d" % (D))
        print("number of topics %d" % (K))
        print("number of unique words %d" % (V))
        doc_v = []
        doc_count = []
        for (d, row_corpus) in enumerate(corpus):
            doc_v.append(np.array([x[0] for x in row_corpus]))
            doc_count.append(np.array([x[1] for x in row_corpus]))
        # initialize variational parameters
        alpha = self.alpha + np.random.rand(D, K)
        beta = self.beta + np.random.rand(K, V)
        reg_weights = np.random.normal(size=self.n_topic)
        slack = np.random.uniform(size=self.n_group) + 1e-3
        self.n_word_in_docs = []
        for d in range(self.n_documents):
            self.n_word_in_docs.append(len(corpus[d]))
        latent_z = []
        for i in range(D):
            temp = np.random.uniform(size=(self.n_word_in_docs[i], self.n_topic)) + 1e-5
            temp /= np.sum(temp, 1).reshape(-1, 1)
            latent_z.append(temp)
        # a[l, d] = 1 if document d belongs to group l
        self.a = np.zeros((self.n_group, self.n_documents))
        for d, l in enumerate(group_no):
            self.a[l, d] = 1.
        lhood_old = float('nan')
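        # Variational EM: update q(z) document by document, then the slack
        # variables, then the regression weights by gradient ascent.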
        for t in range(self.n_iter):
            t1 = time.time()
            dig_alpha = digamma(alpha) - digamma(alpha.sum(axis=1, keepdims=True))
            dig_beta = digamma(beta) - digamma(beta.sum(axis=1, keepdims=True))
            alpha_new = np.ones((D, K)) * self.alpha
            beta_new = np.ones((K, V)) * self.beta
            new_latent_z = []
            document_features = []
            for (d, row_corpus) in enumerate(corpus):
                q = np.zeros((V, K))
                v = doc_v[d]
                count = doc_count[d]
                l = group_no[d]
                if sample_y[l] != -1:
                    # labeled group: q(z) picks up an extra term from the
                    # logistic response
                    z_reg_w_tot = 1.
                    for d2 in range(self.n_documents):
                        cal_tmp = self.a[l, d2] * reg_weights / self.n_word_in_docs[d2]
                        exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                        z_reg_w_tot *= np.prod(np.dot(latent_z[d2], exp_tmp).reshape(-1,))
                    temp = latent_z[d] * np.exp(reg_weights / self.n_word_in_docs[d])
                    z_reg_w = z_reg_w_tot / temp.sum(axis=1)
                    temp3 = sample_y[l] * reg_weights / self.n_word_in_docs[d]
                    temp_a = z_reg_w.reshape(-1, 1) * np.exp(reg_weights / self.n_word_in_docs[d]).reshape(1, -1)
                    temp4 = temp_a / slack[l]
                    temp5 = temp3 - temp4
                    q[v, :] = (np.exp(dig_alpha[d, :].reshape(-1, 1) + dig_beta[:, v] + temp5.T)).T
                else:
                    # unlabeled group: plain LDA update
                    q[v, :] = (np.exp(dig_alpha[d, :].reshape(-1, 1) + dig_beta[:, v])).T
                q[v, :] /= q[v, :].sum(axis=1, keepdims=True)
                new_latent_z.append(q[v])
                # accumulate sufficient statistics for alpha and beta
                alpha_new[d, :] += count.dot(q[v])
                beta_new[:, v] += count * q[v].T
                document_features.append(self._getDocumentFeature(q[v]))
            latent_z = new_latent_z
            # update the slack (auxiliary) variables for labeled groups
            for l in range(self.n_group):
                if sample_y[l] != -1:
                    z_reg_w_tot = 1.
                    for d in range(self.n_documents):
                        cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                        exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                        z_reg_w_tot *= np.prod(np.dot(latent_z[d], exp_tmp).reshape(-1,))
                    slack[l] = z_reg_w_tot + 1.
            alpha = alpha_new.copy()
            beta = beta_new.copy()
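            # M-step for the regression weights: fixed-step-size gradient
            # ascent on the variational bound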
            reg_weights_old = reg_weights.copy()
            n_inner_iter = 1000
            for i in range(n_inner_iter):
                reg_weights += 0.05 * self.gradEta(reg_weights, latent_z, sample_y, slack)
                if i % (n_inner_iter // 3) == 0:
                    diff = np.abs(reg_weights_old - reg_weights)
                    print("eta optimization[%d] %.5f %.3f" % (i, np.max(diff), self.lossF(reg_weights, latent_z, sample_y, slack)))
                    reg_weights_old = reg_weights.copy()
            lhood_lda = self.lhood(alpha, beta, corpus)
            lhood_logit = self.lhoodLogit(reg_weights, latent_z, sample_y, slack, group_no)
            print("lhood[%d] %.3f(%.3f, %.3f) %.1fsec" % (t, lhood_lda + lhood_logit,
                                                          lhood_lda, lhood_logit, time.time() - t1))
            latent_z = np.array(latent_z)
            y_est = self._predict(reg_weights, latent_z, group_no, sample_y)
            print("accuracy:%.3f" % (accuracy_score(y_est, sample_y[sample_y != -1])))
            if lhood_old == lhood_old:  # False only on the first iteration (NaN)
                convergence = (lhood_old - lhood_lda - lhood_logit) / lhood_old
                print("convergence:%.4f" % (convergence))
                if convergence < 1e-5:
                    break
            lhood_old = lhood_lda + lhood_logit
        # normalize the pseudo-counts into topic-word and document-topic
        # distributions
        for k in range(self.n_topic):
            beta[k] = beta[k] / np.sum(beta[k])
        for d in range(D):
            alpha[d] = alpha[d] / np.sum(alpha[d])
        self.coefficient = reg_weights
        self.phi = beta
        self.theta = alpha
        return document_features
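    # Sigmoid link: p(y = 1 | group_feature) under the regression weights.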
    def _logistic(self, coefficients, group_feature):
        mu = np.dot(coefficients, group_feature)
        coef_exp = np.exp(mu)
        return coef_exp / (1 + coef_exp)
    def _predict(self, reg_weights, latent_z, group_no, sample_y):
        d_est = []
        for l in range(self.n_group):
            if sample_y[l] != -1:
                group_feature = self._getGroupFeature(latent_z, group_no, l)
                d_est.append(self._logistic(reg_weights, group_feature))
        return np.array([int(x > 0.5) for x in d_est])
    def predict(self, document_features):
        d_est = []
        for d in range(len(document_features)):
            d_est.append(self._logistic(self.coefficient, document_features[d]))
        return np.array([int(x > 0.5) for x in d_est])
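# Demo: 14 toy documents grouped in pairs (group_no), with one binary label
# per group.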
if __name__ == "__main__":
from gensim import corpora
np.random.seed(12345)
train_text = ['This is man',
'This is woman',
'This is cat',
'I am student',
'This is Japan',
'I am OK',
'This is you',
'I am miracle',
'This is god',
'This is why',
'This is upstire',
'This is moment',
'This is I am',
'This is I am',
]
train_target = [0, 1, 1, 1, 0, 0, 1]
group_no = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in train_text]
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')
corpus = [dictionary.doc2bow(text) for text in texts]
slda = MSldaLogit(2, 30, alpha=1., beta=1.)
document_feature = slda.fit_transform(corpus, train_target, group_no)
document_feature=np.array(document_feature)
X = []
for l in range(len(train_target)):
X.append(np.sum(document_feature[np.array(group_no) == l], 0))
y_est = slda.predict(X)
print(y_est)
print("accuracy:%.3f"%(accuracy_score(y_est, train_target)))
np.set_printoptions(precision=3, suppress=True)
print(slda.theta)
def main1():
    # alternative demo: one document per group (not called by default)
    from gensim import corpora
    train_target = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]
    train_text = ['This is mother',
                  'This is me',
                  'I am good',
                  'I am teacher',
                  'This is grance',
                  'This is JAPAN',
                  'I am USA',
                  'I am China',
                  'This is OK',
                  'This is miracle',
                  'I am place',
                  'I am grance',
                  'This is I am']
    group_no = list(range(len(train_target)))
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in train_text]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('/tmp/deerwester.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    slda = MSldaLogit(3, 30, alpha=1., beta=1.)
    document_feature = slda.fit_transform(corpus, train_target, group_no)
    y_est = slda.predict(document_feature)
    print(y_est)
    print("accuracy", accuracy_score(y_est, train_target))
    np.set_printoptions(precision=3, suppress=True)
    print(slda.theta)