# -*- coding: utf-8 -*-
"""
Reference:
    "Statistical Latent Semantic Analysis via Topic Models"
    (トピックモデルによる統計的潜在意味解析), Chapter 4
"""
from __future__ import print_function
from scipy.special import digamma
import numpy as np
import time
from sklearn.metrics import accuracy_score
import sys
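

# MSldaLogit appears to implement a variational, EM-style training procedure for
# a supervised LDA variant: per-word topic responsibilities are inferred for each
# document, each document is summarized by the fraction of its words assigned to
# each topic, and a logistic regression on the per-group sums of these document
# features predicts a binary label for each group of documents. A label of -1
# marks a missing group label, in which case the group only contributes to the
# unsupervised LDA part of the model.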
class MSldaLogit:
    def __init__(self, n_topic, n_iter, alpha=0.5, beta=0.5):
        self.n_topic = n_topic
        self.n_iter = n_iter
        # alpha and beta are symmetric Dirichlet pseudo-counts added to the
        # document-topic and topic-word statistics, respectively.
        self.alpha = alpha
        self.beta = beta

    def lhood(self, theta, phi, curpus):
        # Corpus log-likelihood under the normalized document-topic (theta)
        # and topic-word (phi) parameters.
        phi_hat = np.zeros(phi.shape)
        theta_hat = np.zeros(theta.shape)
        n_documents = len(curpus)
        for k in range(self.n_topic):
            phi_hat[k] = phi[k] / np.sum(phi[k])
        for d in range(n_documents):
            theta_hat[d] = theta[d] / np.sum(theta[d])
        ret = 0.
        for d, row_corpus in enumerate(curpus):
            for w_i, w_c in row_corpus:
                prob = np.dot(theta_hat[d], phi_hat[:, w_i])
                ret += np.log(prob) * w_c
        return ret

    def lhoodLogit(self, reg_weights, latent_z, sample_y, slack, group_no):
        # Bernoulli log-likelihood of the observed group labels:
        #   sum_l y_l * eta^T x_l - log(1 + exp(eta^T x_l)),
        # where x_l is the group-level topic-assignment feature.
        sample_X = []
        for l in range(self.n_group):
            if sample_y[l] != -1:
                sample_X.append(self._getGroupFeature(latent_z, group_no, l))
        est_y = np.dot(sample_X, reg_weights.reshape(-1, 1)).reshape((-1,))
        probs = sample_y[sample_y != -1] * est_y - np.log(1 + np.exp(est_y))
        return probs.sum()

    def _getGroupFeature(self, latent_z, group_no, group_idx):
        # Sum of the document features of all documents belonging to the group.
        r = []
        for doc_z in np.array(latent_z)[group_no == group_idx]:
            r.append(self._getDocumentFeature(doc_z))
        return np.sum(r, 0)

    def _getDocumentFeature(self, latent_zd):
        # Hard-assign each word to its most probable topic and return the
        # per-topic frequency of those assignments within the document.
        z = np.argmax(latent_zd, 1)
        dummies = []
        for each_z in z:
            val = [0] * self.n_topic
            val[each_z] = 1
            dummies.append(val)
        return np.average(dummies, 0)
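
    # Illustrative sketch of _getDocumentFeature: for a 3-word document with
    # 2 topics and word-level topic probabilities
    #   latent_zd = [[0.9, 0.1],
    #                [0.2, 0.8],
    #                [0.6, 0.4]]
    # the argmax assignments are [0, 1, 0], so the returned feature is
    # [2/3, 1/3], i.e. the fraction of words hard-assigned to each topic.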

    def gradEta(self, reg_weights, latent_z, sample_y, slack):
        # Gradient used to update the logistic-regression weights (eta) by
        # gradient ascent, accumulated over groups with observed labels.
        delta = np.zeros(len(reg_weights))
        for l in range(self.n_group):
            if sample_y[l] == -1:
                continue
            z_reg_w_tot = 1.
            for d in range(self.n_documents):
                cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                z_reg_w_tot *= np.prod(np.dot(latent_z[d], exp_tmp).reshape(-1,))
            tmp = np.sum([self.a[l, d] * np.average(z_d, 0) for d, z_d in enumerate(latent_z)], 0)
            temp1 = sample_y[l] * tmp
            temp2 = 0.
            for d in range(self.n_documents):
                if self.a[l, d] == 1.:
                    cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                    exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                    z_reg_w = z_reg_w_tot / np.dot(latent_z[d], exp_tmp)
                    temp3s = self.a[l, d] * latent_z[d] / self.n_word_in_docs[d]
                    temp2 += np.sum(temp3s * exp_tmp.T * z_reg_w, 0)
            delta += temp1 - temp2 / slack[l]
        return delta

    def lossF(self, reg_weights, latent_z, sample_y, slack):
        # Monitoring objective printed during the eta optimization. Note that it
        # indexes latent_z and n_word_in_docs by the group index l, which
        # implicitly assumes one document per group.
        loss = 0.
        for l in range(self.n_group):
            tmp = np.dot(latent_z[l], np.exp(reg_weights / self.n_word_in_docs[l]).reshape(-1, 1))
            tmp2 = np.prod(tmp) / slack[l] - (1. / slack[l]) - np.log(slack[l]) + 1.
            loss += sample_y[l] * np.dot(np.average(latent_z[l], 0), reg_weights) - tmp2
        return loss

    def fit(self, curpus, sample_y, group_no):
        self.fit_transform(curpus, sample_y, group_no)

    def fit_transform(self, curpus, sample_y, group_no):
        """
        curpus: corpus in gensim bag-of-words format (a list of (word_id, count) pairs per document)
        sample_y: binary (0 or 1) target variable per group; use -1 for a missing label
        group_no: group index of each document in curpus
        """
        group_no = np.array(group_no)
        sample_y = np.array(sample_y)
        self.n_documents = D = len(curpus)
        self.n_group = sample_y.shape[0]
        K = self.n_topic
        # Vocabulary size: the largest word id seen in the corpus, plus one.
        max_index = 0
        for row_corpus in curpus:
            document_max = 0
            for w_i, w_c in row_corpus:
                if document_max < w_i:
                    document_max = w_i
            if max_index < document_max:
                max_index = document_max
        V = max_index + 1
        print("====== start training ======")
        print("number of documents %d" % (D))
        print("number of topics %d" % (K))
        print("number of unique words %d" % (V))
        doc_v = []
        doc_count = []
        for (d, row_corpus) in enumerate(curpus):
            doc_v.append(np.array([x[0] for x in row_corpus]))
            doc_count.append(np.array([x[1] for x in row_corpus]))
        # Initialize the parameters to be estimated.
        alpha = self.alpha + np.random.rand(D, K)
        beta = self.beta + np.random.rand(K, V)
        old_alpha = alpha.copy()
        reg_weights = np.random.normal(size=self.n_topic)
        slack = np.random.uniform(size=self.n_group) + 1e-3
        # Number of unique words in each document (length of its bag-of-words row).
        self.n_word_in_docs = []
        for d in range(self.n_documents):
            self.n_word_in_docs.append(len(curpus[d]))
        latent_z = []
        for i in range(D):
            temp = np.random.uniform(size=(self.n_word_in_docs[i], self.n_topic)) + 1e-5
            temp /= np.sum(temp, 1).reshape(-1, 1)
            latent_z.append(temp)
        # a[l, d] = 1 when document d belongs to group l.
        self.a = np.zeros((self.n_group, self.n_documents))
        for d, l in enumerate(group_no):
            self.a[l, d] = 1.
        lhood_old = float('nan')
        for t in range(self.n_iter):
            t1 = time.time()
            # E[log theta] and E[log phi] under the current Dirichlet parameters.
            dig_alpha = digamma(alpha) - digamma(alpha.sum(axis=1, keepdims=True))
            dig_beta = digamma(beta) - digamma(beta.sum(axis=1, keepdims=True))
            alpha_new = np.ones((D, K)) * self.alpha
            beta_new = np.ones((K, V)) * self.beta
            new_latent_z = []
            document_features = []
            for (d, row_corpus) in enumerate(curpus):
                q = np.zeros((V, K))
                v = doc_v[d]
                count = doc_count[d]
                l = group_no[d]
                if sample_y[l] != -1:
                    # Supervised case: the topic responsibilities also include a
                    # contribution from the logistic-regression bound.
                    z_reg_w_tot = 1.
                    for d2 in range(self.n_documents):
                        cal_tmp = self.a[l, d2] * reg_weights / self.n_word_in_docs[d2]
                        exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                        z_reg_w_tot *= np.prod(np.dot(latent_z[d2], exp_tmp).reshape(-1,))
                    temp = latent_z[d] * np.exp(reg_weights / self.n_word_in_docs[d])
                    z_reg_w = z_reg_w_tot / temp.sum(axis=1)
                    temp3 = sample_y[l] * reg_weights / self.n_word_in_docs[d]
                    #temp_a = np.dot(z_reg_w.reshape(-1, 1), np.exp(reg_weights / self.n_word_in_docs[d]).reshape(1, -1))
                    temp_a = z_reg_w.reshape(-1, 1) * np.exp(reg_weights / self.n_word_in_docs[d]).reshape(1, -1)
                    temp4 = temp_a / slack[l]
                    temp5 = temp3 - temp4
                    q[v, :] = (np.exp(dig_alpha[d, :].reshape(-1, 1) + dig_beta[:, v] + temp5.T)).T
                else:
                    # Unsupervised case: plain LDA responsibilities.
                    q[v, :] = (np.exp(dig_alpha[d, :].reshape(-1, 1) + dig_beta[:, v])).T
                q[v, :] /= q[v, :].sum(axis=1, keepdims=True)
                new_latent_z.append(q[v])
                # Accumulate sufficient statistics for alpha and beta.
                alpha_new[d, :] += count.dot(q[v])
                beta_new[:, v] += count * q[v].T
                document_features.append(self._getDocumentFeature(q[v]))
            latent_z = new_latent_z
            # Update the per-group variational (slack) parameters for groups
            # with an observed label.
            for l in range(self.n_group):
                if sample_y[l] != -1:
                    z_reg_w_tot = 1.
                    for d in range(self.n_documents):
                        cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                        exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                        z_reg_w_tot *= np.prod(np.dot(latent_z[d], exp_tmp).reshape(-1,))
                    temp2 = z_reg_w_tot
                    slack[l] = temp2 + 1.
            alpha = alpha_new.copy()
            beta = beta_new.copy()
            # Optimize the regression weights eta by gradient ascent.
            reg_weights_old = reg_weights.copy()
            n_inner_iter = 1000
            for i in range(n_inner_iter):
                reg_weights += 0.05 * self.gradEta(reg_weights, latent_z, sample_y, slack)
                if i % (n_inner_iter // 3) == 0:
                    diff = np.abs(reg_weights_old - reg_weights)
                    print("eta optimization[%d] %.5f %.3f" % (i, np.max(diff),
                          self.lossF(reg_weights, latent_z, sample_y, slack)))
                    reg_weights_old = reg_weights.copy()
            lhood_lda = self.lhood(alpha, beta, curpus)
            lhood_logit = self.lhoodLogit(reg_weights, latent_z, sample_y, slack, group_no)
            print("lhood[%d] %.3f(%.3f, %.3f) %.1fsec" % (t, lhood_lda + lhood_logit,
                  lhood_lda, lhood_logit, time.time() - t1))
            latent_z = np.array(latent_z)
            y_est = self._predict(reg_weights, latent_z, group_no, sample_y)
            print("accuracy:%.3f" % (accuracy_score(y_est, sample_y[sample_y != -1])))
            # Convergence check on the combined log-likelihood; lhood_old == lhood_old
            # is False only for NaN, so the check is skipped on the first iteration.
            if lhood_old == lhood_old:
                convergence = (lhood_old - lhood_lda - lhood_logit) / lhood_old
                print("convergence:%.4f" % (convergence))
                if convergence < 1e-5:
                    break
            lhood_old = lhood_lda + lhood_logit
        # Normalize the accumulated counts into topic-word (phi) and
        # document-topic (theta) distributions.
        for k in range(self.n_topic):
            beta[k] = beta[k] / np.sum(beta[k])
        for d in range(D):
            alpha[d] = alpha[d] / np.sum(alpha[d])
        self.coefficient = reg_weights
        self.phi = beta
        self.theta = alpha
        return document_features

    def _logistic(self, coefficients, group_feature):
        mu = np.dot(coefficients, group_feature)
        coef_exp = np.exp(mu)
        return coef_exp / (1 + coef_exp)
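
    # _logistic computes the sigmoid exp(mu) / (1 + exp(mu)); an equivalent and
    # numerically more stable form would be 1. / (1. + np.exp(-mu)).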

    def _predict(self, reg_weights, latent_z, group_no, sample_y):
        # Predict labels for the groups with observed labels (used for the
        # training-accuracy report during fitting).
        d_est = []
        for l in range(self.n_group):
            if sample_y[l] != -1:
                group_feature = self._getGroupFeature(latent_z, group_no, l)
                d_est.append(self._logistic(reg_weights, group_feature))
        return np.array([int(x > 0.5) for x in d_est])

    def predict(self, document_features):
        # Predict a label from an already aggregated feature vector per sample.
        d_est = []
        for d in range(len(document_features)):
            d_est.append(self._logistic(self.coefficient, document_features[d]))
        return np.array([int(x > 0.5) for x in d_est])
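

# Convenience sketch (a hypothetical helper, not used by the class itself): it
# aggregates per-document features into one feature vector per group, mirroring
# the inline aggregation done in the __main__ demo below.
def build_group_features(document_features, group_no):
    document_features = np.array(document_features)
    group_no = np.array(group_no)
    n_group = group_no.max() + 1
    return [np.sum(document_features[group_no == l], 0) for l in range(n_group)]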
| if __name__ == "__main__": | |
| from gensim import corpora | |
| np.random.seed(12345) | |
| train_text = ['This is man', | |
| 'This is woman', | |
| 'This is cat', | |
| 'I am student', | |
| 'This is Japan', | |
| 'I am OK', | |
| 'This is you', | |
| 'I am miracle', | |
| 'This is god', | |
| 'This is why', | |
| 'This is upstire', | |
| 'This is moment', | |
| 'This is I am', | |
| 'This is I am', | |
| ] | |
| train_target = [0, 1, 1, 1, 0, 0, 1] | |
| group_no = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6] | |
| stoplist = set('for a of the and to in'.split()) | |
| texts = [[word for word in document.lower().split() if word not in stoplist] | |
| for document in train_text] | |
| dictionary = corpora.Dictionary(texts) | |
| dictionary.save('/tmp/deerwester.dict') | |
| corpus = [dictionary.doc2bow(text) for text in texts] | |
| slda = MSldaLogit(2, 30, alpha=1., beta=1.) | |
| document_feature = slda.fit_transform(corpus, train_target, group_no) | |
| document_feature=np.array(document_feature) | |
| X = [] | |
| for l in range(len(train_target)): | |
| X.append(np.sum(document_feature[np.array(group_no) == l], 0)) | |
| y_est = slda.predict(X) | |
| print(y_est) | |
| print("accuracy:%.3f"%(accuracy_score(y_est, train_target))) | |
| np.set_printoptions(precision=3, suppress=True) | |
| print(slda.theta) | |


def main1():
    # Alternative demo: one document per group, so each document carries its
    # own binary label.
    train_target = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]
    train_text = ['This is mother',
                  'This is me',
                  'I am good',
                  'I am teacher',
                  'This is grance',
                  'This is JAPAN',
                  'I am USA',
                  'I am China',
                  'This is OK',
                  'This is miracle',
                  'I am place',
                  'I am grance',
                  'This is I am']
    group_no = list(range(len(train_target)))
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in train_text]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('/tmp/deerwester.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    slda = MSldaLogit(3, 30, alpha=1., beta=1.)
    document_feature = slda.fit_transform(corpus, train_target, group_no)
    y_est = slda.predict(document_feature)
    print(y_est)
    print("accuracy", accuracy_score(y_est, train_target))
    np.set_printoptions(precision=3, suppress=True)
    print(slda.theta)
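
# Note: main1() is never invoked. Since it is defined after the __main__ block,
# a call to it would have to be placed below this definition, and gensim's
# `corpora` would need to be imported at module level for it to run on its own.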