# -*- coding: utf-8 -*-
"""
Reference:
    "Statistical Latent Semantic Analysis via Topic Models"
    (トピックモデルによる統計的潜在意味解析), Chapter 4
"""
from __future__ import print_function
from scipy.special import digamma
import numpy as np
import time
from sklearn.metrics import accuracy_score
import sys
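

# MSldaLogit appears to implement a variational, EM-style training procedure for
# a supervised LDA variant: per-word topic responsibilities are inferred for each
# document, each document is summarized by the fraction of its words assigned to
# each topic, and a logistic regression on the per-group sums of these document
# features predicts a binary label for each group of documents. A label of -1
# marks a missing group label, in which case the group only contributes to the
# unsupervised LDA part of the model.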
class MSldaLogit:
    def __init__(self, n_topic, n_iter, alpha=0.5, beta=0.5):
        self.n_topic = n_topic
        self.n_iter = n_iter
        # alpha and beta are symmetric Dirichlet pseudo-counts added to the
        # document-topic and topic-word statistics, respectively.
        self.alpha = alpha
        self.beta = beta

    def lhood(self, theta, phi, curpus):
        # Corpus log-likelihood under the normalized document-topic (theta)
        # and topic-word (phi) parameters.
        phi_hat = np.zeros(phi.shape)
        theta_hat = np.zeros(theta.shape)
        n_documents = len(curpus)
        for k in range(self.n_topic):
            phi_hat[k] = phi[k] / np.sum(phi[k])
        for d in range(n_documents):
            theta_hat[d] = theta[d] / np.sum(theta[d])
        ret = 0.
        for d, row_corpus in enumerate(curpus):
            for w_i, w_c in row_corpus:
                prob = np.dot(theta_hat[d], phi_hat[:, w_i])
                ret += np.log(prob) * w_c
        return ret

    def lhoodLogit(self, reg_weights, latent_z, sample_y, slack, group_no):
        # Bernoulli log-likelihood of the observed group labels:
        #   sum_l y_l * eta^T x_l - log(1 + exp(eta^T x_l)),
        # where x_l is the group-level topic-assignment feature.
        sample_X = []
        for l in range(self.n_group):
            if sample_y[l] != -1:
                sample_X.append(self._getGroupFeature(latent_z, group_no, l))
        est_y = np.dot(sample_X, reg_weights.reshape(-1, 1)).reshape((-1,))
        probs = sample_y[sample_y != -1] * est_y - np.log(1 + np.exp(est_y))
        return probs.sum()

    def _getGroupFeature(self, latent_z, group_no, group_idx):
        # Sum of the document features of all documents belonging to the group.
        r = []
        for doc_z in np.array(latent_z)[group_no == group_idx]:
            r.append(self._getDocumentFeature(doc_z))
        return np.sum(r, 0)

    def _getDocumentFeature(self, latent_zd):
        # Hard-assign each word to its most probable topic and return the
        # per-topic frequency of those assignments within the document.
        z = np.argmax(latent_zd, 1)
        dummies = []
        for each_z in z:
            val = [0] * self.n_topic
            val[each_z] = 1
            dummies.append(val)
        return np.average(dummies, 0)
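
    # Illustrative sketch of _getDocumentFeature: for a 3-word document with
    # 2 topics and word-level topic probabilities
    #   latent_zd = [[0.9, 0.1],
    #                [0.2, 0.8],
    #                [0.6, 0.4]]
    # the argmax assignments are [0, 1, 0], so the returned feature is
    # [2/3, 1/3], i.e. the fraction of words hard-assigned to each topic.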

    def gradEta(self, reg_weights, latent_z, sample_y, slack):
        # Gradient used to update the logistic-regression weights (eta) by
        # gradient ascent, accumulated over groups with observed labels.
        delta = np.zeros(len(reg_weights))
        for l in range(self.n_group):
            if sample_y[l] == -1:
                continue
            z_reg_w_tot = 1.
            for d in range(self.n_documents):
                cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                z_reg_w_tot *= np.prod(np.dot(latent_z[d], exp_tmp).reshape(-1,))
            tmp = np.sum([self.a[l, d] * np.average(z_d, 0) for d, z_d in enumerate(latent_z)], 0)
            temp1 = sample_y[l] * tmp
            temp2 = 0.
            for d in range(self.n_documents):
                if self.a[l, d] == 1.:
                    cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                    exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                    z_reg_w = z_reg_w_tot / np.dot(latent_z[d], exp_tmp)
                    temp3s = self.a[l, d] * latent_z[d] / self.n_word_in_docs[d]
                    temp2 += np.sum(temp3s * exp_tmp.T * z_reg_w, 0)
            delta += temp1 - temp2 / slack[l]
        return delta

    def lossF(self, reg_weights, latent_z, sample_y, slack):
        # Monitoring objective printed during the eta optimization. Note that it
        # indexes latent_z and n_word_in_docs by the group index l, which
        # implicitly assumes one document per group.
        loss = 0.
        for l in range(self.n_group):
            tmp = np.dot(latent_z[l], np.exp(reg_weights / self.n_word_in_docs[l]).reshape(-1, 1))
            tmp2 = np.prod(tmp) / slack[l] - (1. / slack[l]) - np.log(slack[l]) + 1.
            loss += sample_y[l] * np.dot(np.average(latent_z[l], 0), reg_weights) - tmp2
        return loss

    def fit(self, curpus, sample_y, group_no):
        self.fit_transform(curpus, sample_y, group_no)

    def fit_transform(self, curpus, sample_y, group_no):
        """
        curpus: corpus in gensim bag-of-words format (a list of (word_id, count) pairs per document)
        sample_y: binary (0 or 1) target variable per group; use -1 for a missing label
        group_no: group index of each document in curpus
        """
        group_no = np.array(group_no)
        sample_y = np.array(sample_y)
        self.n_documents = D = len(curpus)
        self.n_group = sample_y.shape[0]
        K = self.n_topic
        # Vocabulary size: the largest word id seen in the corpus, plus one.
        max_index = 0
        for row_corpus in curpus:
            document_max = 0
            for w_i, w_c in row_corpus:
                if document_max < w_i:
                    document_max = w_i
            if max_index < document_max:
                max_index = document_max
        V = max_index + 1
        print("====== start training ======")
        print("number of documents %d" % (D))
        print("number of topics %d" % (K))
        print("number of unique words %d" % (V))
        doc_v = []
        doc_count = []
        for (d, row_corpus) in enumerate(curpus):
            doc_v.append(np.array([x[0] for x in row_corpus]))
            doc_count.append(np.array([x[1] for x in row_corpus]))
        # Initialize the parameters to be estimated.
        alpha = self.alpha + np.random.rand(D, K)
        beta = self.beta + np.random.rand(K, V)
        old_alpha = alpha.copy()
        reg_weights = np.random.normal(size=self.n_topic)
        slack = np.random.uniform(size=self.n_group) + 1e-3
        # Number of unique words in each document (length of its bag-of-words row).
        self.n_word_in_docs = []
        for d in range(self.n_documents):
            self.n_word_in_docs.append(len(curpus[d]))
        latent_z = []
        for i in range(D):
            temp = np.random.uniform(size=(self.n_word_in_docs[i], self.n_topic)) + 1e-5
            temp /= np.sum(temp, 1).reshape(-1, 1)
            latent_z.append(temp)
        # a[l, d] = 1 when document d belongs to group l.
        self.a = np.zeros((self.n_group, self.n_documents))
        for d, l in enumerate(group_no):
            self.a[l, d] = 1.
        lhood_old = float('nan')
        for t in range(self.n_iter):
            t1 = time.time()
            # E[log theta] and E[log phi] under the current Dirichlet parameters.
            dig_alpha = digamma(alpha) - digamma(alpha.sum(axis=1, keepdims=True))
            dig_beta = digamma(beta) - digamma(beta.sum(axis=1, keepdims=True))
            alpha_new = np.ones((D, K)) * self.alpha
            beta_new = np.ones((K, V)) * self.beta
            new_latent_z = []
            document_features = []
            for (d, row_corpus) in enumerate(curpus):
                q = np.zeros((V, K))
                v = doc_v[d]
                count = doc_count[d]
                l = group_no[d]
                if sample_y[l] != -1:
                    # Supervised case: the topic responsibilities also include a
                    # contribution from the logistic-regression bound.
                    z_reg_w_tot = 1.
                    for d2 in range(self.n_documents):
                        cal_tmp = self.a[l, d2] * reg_weights / self.n_word_in_docs[d2]
                        exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                        z_reg_w_tot *= np.prod(np.dot(latent_z[d2], exp_tmp).reshape(-1,))
                    temp = latent_z[d] * np.exp(reg_weights / self.n_word_in_docs[d])
                    z_reg_w = z_reg_w_tot / temp.sum(axis=1)
                    temp3 = sample_y[l] * reg_weights / self.n_word_in_docs[d]
                    #temp_a = np.dot(z_reg_w.reshape(-1, 1), np.exp(reg_weights / self.n_word_in_docs[d]).reshape(1, -1))
                    temp_a = z_reg_w.reshape(-1, 1) * np.exp(reg_weights / self.n_word_in_docs[d]).reshape(1, -1)
                    temp4 = temp_a / slack[l]
                    temp5 = temp3 - temp4
                    q[v, :] = (np.exp(dig_alpha[d, :].reshape(-1, 1) + dig_beta[:, v] + temp5.T)).T
                else:
                    # Unsupervised case: plain LDA responsibilities.
                    q[v, :] = (np.exp(dig_alpha[d, :].reshape(-1, 1) + dig_beta[:, v])).T
                q[v, :] /= q[v, :].sum(axis=1, keepdims=True)
                new_latent_z.append(q[v])
                # Accumulate sufficient statistics for alpha and beta.
                alpha_new[d, :] += count.dot(q[v])
                beta_new[:, v] += count * q[v].T
                document_features.append(self._getDocumentFeature(q[v]))
            latent_z = new_latent_z
            # Update the per-group variational (slack) parameters for groups
            # with an observed label.
            for l in range(self.n_group):
                if sample_y[l] != -1:
                    z_reg_w_tot = 1.
                    for d in range(self.n_documents):
                        cal_tmp = self.a[l, d] * reg_weights / self.n_word_in_docs[d]
                        exp_tmp = np.exp(cal_tmp).reshape(-1, 1)
                        z_reg_w_tot *= np.prod(np.dot(latent_z[d], exp_tmp).reshape(-1,))
                    temp2 = z_reg_w_tot
                    slack[l] = temp2 + 1.
            alpha = alpha_new.copy()
            beta = beta_new.copy()
            # Optimize the regression weights eta by gradient ascent.
            reg_weights_old = reg_weights.copy()
            n_inner_iter = 1000
            for i in range(n_inner_iter):
                reg_weights += 0.05 * self.gradEta(reg_weights, latent_z, sample_y, slack)
                if i % (n_inner_iter // 3) == 0:
                    diff = np.abs(reg_weights_old - reg_weights)
                    print("eta optimization[%d] %.5f %.3f" % (i, np.max(diff),
                          self.lossF(reg_weights, latent_z, sample_y, slack)))
                    reg_weights_old = reg_weights.copy()
            lhood_lda = self.lhood(alpha, beta, curpus)
            lhood_logit = self.lhoodLogit(reg_weights, latent_z, sample_y, slack, group_no)
            print("lhood[%d] %.3f(%.3f, %.3f) %.1fsec" % (t, lhood_lda + lhood_logit,
                  lhood_lda, lhood_logit, time.time() - t1))
            latent_z = np.array(latent_z)
            y_est = self._predict(reg_weights, latent_z, group_no, sample_y)
            print("accuracy:%.3f" % (accuracy_score(y_est, sample_y[sample_y != -1])))
            # Convergence check on the combined log-likelihood; lhood_old == lhood_old
            # is False only for NaN, so the check is skipped on the first iteration.
            if lhood_old == lhood_old:
                convergence = (lhood_old - lhood_lda - lhood_logit) / lhood_old
                print("convergence:%.4f" % (convergence))
                if convergence < 1e-5:
                    break
            lhood_old = lhood_lda + lhood_logit
        # Normalize the accumulated counts into topic-word (phi) and
        # document-topic (theta) distributions.
        for k in range(self.n_topic):
            beta[k] = beta[k] / np.sum(beta[k])
        for d in range(D):
            alpha[d] = alpha[d] / np.sum(alpha[d])
        self.coefficient = reg_weights
        self.phi = beta
        self.theta = alpha
        return document_features

    def _logistic(self, coefficients, group_feature):
        mu = np.dot(coefficients, group_feature)
        coef_exp = np.exp(mu)
        return coef_exp / (1 + coef_exp)
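
    # _logistic computes the sigmoid exp(mu) / (1 + exp(mu)); an equivalent and
    # numerically more stable form would be 1. / (1. + np.exp(-mu)).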

    def _predict(self, reg_weights, latent_z, group_no, sample_y):
        # Predict labels for the groups with observed labels (used for the
        # training-accuracy report during fitting).
        d_est = []
        for l in range(self.n_group):
            if sample_y[l] != -1:
                group_feature = self._getGroupFeature(latent_z, group_no, l)
                d_est.append(self._logistic(reg_weights, group_feature))
        return np.array([int(x > 0.5) for x in d_est])

    def predict(self, document_features):
        # Predict a label from an already aggregated feature vector per sample.
        d_est = []
        for d in range(len(document_features)):
            d_est.append(self._logistic(self.coefficient, document_features[d]))
        return np.array([int(x > 0.5) for x in d_est])
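

# Convenience sketch (a hypothetical helper, not used by the class itself): it
# aggregates per-document features into one feature vector per group, mirroring
# the inline aggregation done in the __main__ demo below.
def build_group_features(document_features, group_no):
    document_features = np.array(document_features)
    group_no = np.array(group_no)
    n_group = group_no.max() + 1
    return [np.sum(document_features[group_no == l], 0) for l in range(n_group)]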
| if __name__ == "__main__": | |
| from gensim import corpora | |
| np.random.seed(12345) | |
| train_text = ['This is man', | |
| 'This is woman', | |
| 'This is cat', | |
| 'I am student', | |
| 'This is Japan', | |
| 'I am OK', | |
| 'This is you', | |
| 'I am miracle', | |
| 'This is god', | |
| 'This is why', | |
| 'This is upstire', | |
| 'This is moment', | |
| 'This is I am', | |
| 'This is I am', | |
| ] | |
| train_target = [0, 1, 1, 1, 0, 0, 1] | |
| group_no = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6] | |
| stoplist = set('for a of the and to in'.split()) | |
| texts = [[word for word in document.lower().split() if word not in stoplist] | |
| for document in train_text] | |
| dictionary = corpora.Dictionary(texts) | |
| dictionary.save('/tmp/deerwester.dict') | |
| corpus = [dictionary.doc2bow(text) for text in texts] | |
| slda = MSldaLogit(2, 30, alpha=1., beta=1.) | |
| document_feature = slda.fit_transform(corpus, train_target, group_no) | |
| document_feature=np.array(document_feature) | |
| X = [] | |
| for l in range(len(train_target)): | |
| X.append(np.sum(document_feature[np.array(group_no) == l], 0)) | |
| y_est = slda.predict(X) | |
| print(y_est) | |
| print("accuracy:%.3f"%(accuracy_score(y_est, train_target))) | |
| np.set_printoptions(precision=3, suppress=True) | |
| print(slda.theta) | |


def main1():
    # Alternative demo: one document per group, so each document carries its
    # own binary label.
    train_target = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]
    train_text = ['This is mother',
                  'This is me',
                  'I am good',
                  'I am teacher',
                  'This is grance',
                  'This is JAPAN',
                  'I am USA',
                  'I am China',
                  'This is OK',
                  'This is miracle',
                  'I am place',
                  'I am grance',
                  'This is I am']
    group_no = list(range(len(train_target)))
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in train_text]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('/tmp/deerwester.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    slda = MSldaLogit(3, 30, alpha=1., beta=1.)
    document_feature = slda.fit_transform(corpus, train_target, group_no)
    y_est = slda.predict(document_feature)
    print(y_est)
    print("accuracy", accuracy_score(y_est, train_target))
    np.set_printoptions(precision=3, suppress=True)
    print(slda.theta)
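
# Note: main1() is never invoked. Since it is defined after the __main__ block,
# a call to it would have to be placed below this definition, and gensim's
# `corpora` would need to be imported at module level for it to run on its own.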