y-mitsui/g_test.py

## g_test.py
"""
G検定による独立性の検定
"""
import numpy as np
from scipy.stats import chisqprob, chisquare

def gtest(f_obs, f_exp=None, ddof=0):
    """
    http://en.wikipedia.org/wiki/G-test
    The G test can test for goodness of fit to a distribution
    Parameters
    ----------
    f_obs : array
        observed frequencies in each category
    f_exp : array, optional
        expected frequencies in each category.  By default the categories are
        assumed to be equally likely.
    ddof : int, optional
        adjustment to the degrees of freedom for the p-value
    Returns
    -------
    chisquare statistic : float
        The chisquare test statistic
    p : float
        The p-value of the test.
    Notes
    -----
    The p-value indicates the probability that the observed distribution is
    drawn from a distribution given frequencies in expected.
    So a low p-value inidcates the distributions are different.
    Examples
    --------
    >>> gtest([9.0, 8.1, 2, 1, 0.1, 20.0], [10, 5.01, 6, 4, 2, 1])
    (117.94955444335938, 8.5298516190930345e-24)
    >>> gtest([1.01, 1.01, 4.01], [1.00, 1.00, 4.00])
    (0.060224734246730804, 0.97033649350189344)
    >>> gtest([2, 1, 6], [4, 3, 2])
    (8.2135343551635742, 0.016460903780063787)
    References
    ----------
    http://en.wikipedia.org/wiki/G-test
    """
    f_obs = np.asarray(f_obs, 'f')
    k = f_obs.shape[0]
    f_exp = np.array([np.sum(f_obs, axis=0) / float(k)] * k, 'f') \
                if f_exp is None \
                else np.asarray(f_exp, 'f')
    g = 2 * np.add.reduce(f_obs * np.log(f_obs / f_exp))
    return g, chisqprob(g, k - 1 - ddof)

def makeTable(n_sample1, n_sample2, true_prob1, true_prob2):
    sample_1 = np.random.multinomial(1, true_prob1, size=n_sample1)
    sample_2 = np.random.multinomial(1, true_prob2, size=n_sample2)
    return np.array([np.sum(sample_1, 0), np.sum(sample_2, 0)])

def expectedFreq(tab):
    tot_col = np.sum(tab, 0)
    tot_row = np.sum(tab, 1)
    expe = []
    for i in range(tab.shape[0]):
        row = []
        for j in range(tab.shape[1]):
            row.append((tot_col[j] * tot_row[i]) / np.sum(tab))
        expe.append(row)
    return np.array(expe)

sample_tab = makeTable(40, 40, [0.4, 0.6], [0.5, 0.5])
expe_tab = expectedFreq(sample_tab)

print(gtest(sample_tab.reshape(-1,), expe_tab.reshape(-1,)))
	"""
	G検定による独立性の検定
	"""
	import numpy as np
	from scipy.stats import chisqprob, chisquare

	def gtest(f_obs, f_exp=None, ddof=0):
	"""
	http://en.wikipedia.org/wiki/G-test
	The G test can test for goodness of fit to a distribution
	Parameters
	----------
	f_obs : array
	observed frequencies in each category
	f_exp : array, optional
	expected frequencies in each category. By default the categories are
	assumed to be equally likely.
	ddof : int, optional
	adjustment to the degrees of freedom for the p-value
	Returns
	-------
	chisquare statistic : float
	The chisquare test statistic
	p : float
	The p-value of the test.
	Notes
	-----
	The p-value indicates the probability that the observed distribution is
	drawn from a distribution given frequencies in expected.
	So a low p-value inidcates the distributions are different.
	Examples
	--------
	>>> gtest([9.0, 8.1, 2, 1, 0.1, 20.0], [10, 5.01, 6, 4, 2, 1])
	(117.94955444335938, 8.5298516190930345e-24)
	>>> gtest([1.01, 1.01, 4.01], [1.00, 1.00, 4.00])
	(0.060224734246730804, 0.97033649350189344)
	>>> gtest([2, 1, 6], [4, 3, 2])
	(8.2135343551635742, 0.016460903780063787)
	References
	----------
	http://en.wikipedia.org/wiki/G-test
	"""
	f_obs = np.asarray(f_obs, 'f')
	k = f_obs.shape[0]
	f_exp = np.array([np.sum(f_obs, axis=0) / float(k)] * k, 'f') \
	if f_exp is None \
	else np.asarray(f_exp, 'f')
	g = 2 * np.add.reduce(f_obs * np.log(f_obs / f_exp))
	return g, chisqprob(g, k - 1 - ddof)

	def makeTable(n_sample1, n_sample2, true_prob1, true_prob2):
	sample_1 = np.random.multinomial(1, true_prob1, size=n_sample1)
	sample_2 = np.random.multinomial(1, true_prob2, size=n_sample2)
	return np.array([np.sum(sample_1, 0), np.sum(sample_2, 0)])

	def expectedFreq(tab):
	tot_col = np.sum(tab, 0)
	tot_row = np.sum(tab, 1)
	expe = []
	for i in range(tab.shape[0]):
	row = []
	for j in range(tab.shape[1]):
	row.append((tot_col[j] * tot_row[i]) / np.sum(tab))
	expe.append(row)
	return np.array(expe)

	sample_tab = makeTable(40, 40, [0.4, 0.6], [0.5, 0.5])
	expe_tab = expectedFreq(sample_tab)

	print(gtest(sample_tab.reshape(-1,), expe_tab.reshape(-1,)))
No results found