Last active
December 27, 2015 23:19
-
-
Save sbirch/7405187 to your computer and use it in GitHub Desktop.
Homework stencil for SVM / Brown CS195W
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools
import sys
from os import devnull

from svmutil import *
| # Hello there! | |
| # You should read the commentary in seven_steps_to_OK_performance, although | |
| # you won't have to implement anything other than the data transformation | |
| # (as it's somewhat tedious to go through the LibSVM docs, and they provide | |
| # a similar module -- easy.py -- that does substantially the same thing | |
| # as this code.) | |
| # You will call this method at the bottom of the file with your munged data. | |
| # See the docstring for the format of the arguments. | |
| def seven_steps_to_OK_performance(labels, data, tune_size=150, train_size=5000): | |
| ''' | |
| A procedure implementing the procedure from Hsu et al. for practical | |
| support vector classification. Does not return anything, but prints | |
| all the info you'll need. | |
| This procedure does not transform or scale the data -- you have to do that. | |
| Arguments: | |
| labels: a list of the labels for each data point. The labels must be | |
| converted to numbers (a LibSVM thing.) I recommend 0 and 1, not | |
| that the choice matters. | |
| data: a list of feature vectors (which are lists of numbers). | |
| Example: | |
| seven_steps_to_OK_performance( | |
| # Labels for the 3 data points | |
| [1, 0, 1], | |
| [ | |
| # These are feature vectors: | |
| [0.5, 1, 0, 0.2], | |
| [0.2, 0, 1, 0.3], | |
| [0.12, 1, 1, 0.4] | |
| ]) | |
| ''' | |
| assert len(labels) == len(data) | |
| # We split the data up into three parts: tune, train, and test. | |
| # The two numbers specify the boundaries in the dataset. Tuning data | |
| # are (by default, specificed in the parameters above) the first 150 data | |
| # points, training the 5000 after that, and the remainder are test data. | |
| # We do this because tuning is very expensive -- with the grid searches | |
| # used here, we have to perform 5-fold cross-validation about 500 times, | |
| # which means it has to train on a dataset 2500 times! | |
| # We hope that the parameters we get out of tuning on the small dataset | |
| # are relevant to the training set, which is separate and much larger. | |
| # Likewise, we hope that the training dataset is an accurate model of | |
| # the test set. | |
| # Note that even though computing the RBF kernel is O(n^2) we can still | |
| # have quite reasonably sized training sets (just probably not millions). | |
| # If you're curious, try seeing what impact the training size has on the | |
| # accuracy during the test (hint: diminishing returns.) | |
| labels_tune = labels[:tune_size] | |
| data_tune = data[:tune_size] | |
| labels_train = labels[tune_size:tune_size+train_size] | |
| data_train = data[tune_size:tune_size+train_size] | |
| labels_test = labels[tune_size+train_size:] | |
| data_test = data[tune_size+train_size:] | |
| # The RBF kernel has two parameters, C and gamma. C is a general parameter | |
| # to all sorts of SVMs which controls how to penalize misclassifications. | |
| # Gamma appears in the radial basis kernel function (the replacement dot product): | |
| # K(a, b) = exp(-gamma * distance(a,b)^2) | |
| # Where a and b are feature vectors. | |
| # LibSVM initializes C to 1, and gamma to 1/number of features. Gamma has | |
| # a nice theoretical interpretation scaling the distance term -- the higher | |
| # dimensional the feature space the larger distances will tend to be, and | |
| # the smaller gamma should be to compensate. | |
| # Instead of using the defaults, we tune using grid search. Grid search is | |
| # a naive, utterly brute force method which tries every combination of | |
| # values in a grid. Hsu et al. advocate for grid search in the guide | |
| # because it's simple, parallelizable, and not too slow when there are | |
| # only two dimensions (i.e. C and gamma.) | |
| print 'Tune size =', len(labels_tune) | |
| prob = svm_problem(labels_tune, data_tune) | |
| # We use 5-fold cross-validation as the evaluation metric for given | |
| # C and gamma. | |
| null_device = open(devnull, 'w') | |
| def grid_search_evaluate(C, gamma): | |
| param = encode_params(C, gamma, cv=5) | |
| # For some reason cross validation ignores the -q flag. | |
| # This will redirect any output during CV to the null device. | |
| original_stdout = sys.stdout | |
| sys.stdout = null_device | |
| cv_accuracy = svm_train(prob, param) | |
| sys.stdout = original_stdout | |
| return cv_accuracy | |
| # We do a two-stage grid search. The first stage is on an exponential grid | |
| # to determine what order of magnitude the parameters should be, searching | |
| # from 2^-5 to 2^15. The second grid search looks in an area 50% around | |
| # the exponential values to try to tune them a little more finely. | |
| # Coarse exponential search | |
| print 'Tuning with coarse grid search...' | |
| best_accuracy, (C, gamma) = grid_search( | |
| ([2.0**x for x in arange(-5, 15)], [2.0**x for x in arange(-5, 15)]), | |
| grid_search_evaluate | |
| ) | |
| print 'Best tuned accuracy =', best_accuracy | |
| print 'C, gamma =', C, gamma | |
| # Local optimization (best point within 50% of coarse coordinates) | |
| print 'Optimizing locally...' | |
| best_accuracy, (C, gamma) = grid_search( | |
| ([C*x for x in arange(0.5, 1.5, 0.1)], [gamma*x for x in arange(0.5, 1.5, 0.1)]), | |
| grid_search_evaluate | |
| ) | |
| print 'Best tuned accuracy =', best_accuracy | |
| print 'C, gamma =', C, gamma | |
| # Train on the training set (which is larger) | |
| print 'Training the full model...' | |
| print 'Train size =', len(labels_train) | |
| train_prob = svm_problem(labels_train, data_train) | |
| param = encode_params(C, gamma) | |
| model = svm_train(train_prob, param) | |
| # Test on the remainder of the data. | |
| # In a typical application we'd use the model we just trained to predict | |
| # new datapoints where the labels are unknown and hope that the model gets | |
| # them right. Instead, we're just using the rest of the data (with known | |
| # labels) to evaluate the model. | |
| print 'Testing the full model...' | |
| print 'Test size =', len(labels_test) | |
| predicted_labels, (accuracy, mse, scc), p_val = svm_predict( | |
| labels_test, data_test, model, '-q -b 1') | |
| print 'Test accuracy =', accuracy | |
def arange(start, stop, step=1.0):
    '''Like range(), but allows floats.

    Yields start, start + step, start + 2*step, ... for every value
    strictly less than stop.

    Each value is computed as start + i*step rather than by repeated
    addition, which avoids accumulating floating-point rounding error
    over many steps (repeated += drift can add or drop an endpoint).
    '''
    i = 0
    value = start
    while value < stop:
        yield value
        i += 1
        value = start + i * step
def encode_params(C, gamma, cv=None):
    '''Construct a svm_parameter object, as used for svm_train.

    Flags used:
        -q    suppress output
        -t 2  RBF kernel
        -b 1  train for probability estimates
        -c    the C (misclassification penalty) parameter
        -g    the gamma (RBF width) parameter
        -v N  N-fold cross-validation (svm_train then returns the CV
              accuracy instead of a model)

    Arguments:
        C: misclassification penalty parameter.
        gamma: RBF kernel coefficient.
        cv: if not None, the number of cross-validation folds to request.
    '''
    # Build the common flag string once; append -v only when CV is wanted.
    flags = '-q -t 2 -c %f -b 1 -g %f' % (C, gamma)
    if cv is not None:
        flags += ' -v %d' % cv
    return svm_parameter(flags)
def grid_search(dimensions, evaluator):
    '''A naive, exhaustive grid search.

    Arguments:
        dimensions: a list of dimensions, each one of which is a list of
            values to try.
        evaluator: a function mapping from coordinates to a value.

    Returns a (best_value, best_coordinates) tuple: the maximum value the
    evaluator produced over the whole grid, and the coordinates that
    produced it. If several points tie, the first one encountered (in
    itertools.product order) wins.
    '''
    best_value = None
    best_coordinates = None
    for coordinates in itertools.product(*dimensions):
        val = evaluator(*coordinates)
        # Strict > keeps the earliest coordinates on ties.
        if best_value is None or val > best_value:
            best_value = val
            best_coordinates = coordinates
    return best_value, best_coordinates
# Write your transformation code here!
# - Load the adults.csv data (I recommend Python's csv module)
# - Extract the labels (in the last column) and transform them to numbers. Make
#   sure you remove the label column from the feature data!
# - Transform the other columns
# Eventually, assign these. See docstring for seven_steps_to_OK_performance.
#labels =
#data =
# Note that you can also call this function with keyword parameters
# tune_size and train_size if you want to test your code with something
# quicker. Try tune_size=50 and train_size=500 (should take less than 10s.)
# NOTE(review): this call raises NameError until the `labels` and `data`
# assignments above are filled in -- that is intentional for the stencil.
seven_steps_to_OK_performance(labels, data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment