Last active
December 27, 2015 23:19
-
-
Save sbirch/7405187 to your computer and use it in GitHub Desktop.
Homework stencil for SVM / Brown CS195W
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools
import sys
from os import devnull

from svmutil import *
| # Hello there! | |
| # You should read the commentary in seven_steps_to_OK_performance, although | |
| # you won't have to implement anything other than the data transformation | |
| # (as it's somewhat tedious to go through the LibSVM docs, and they provide | |
| # a similar module -- easy.py -- that does substantially the same thing | |
| # as this code.) | |
| # You will call this method at the bottom of the file with your munged data. | |
| # See the docstring for the format of the arguments. | |
| def seven_steps_to_OK_performance(labels, data, tune_size=150, train_size=5000): | |
| ''' | |
| A procedure implementing the procedure from Hsu et al. for practical | |
| support vector classification. Does not return anything, but prints | |
| all the info you'll need. | |
| This procedure does not transform or scale the data -- you have to do that. | |
| Arguments: | |
| labels: a list of the labels for each data point. The labels must be | |
| converted to numbers (a LibSVM thing.) I recommend 0 and 1, not | |
| that the choice matters. | |
| data: a list of feature vectors (which are lists of numbers). | |
| Example: | |
| seven_steps_to_OK_performance( | |
| # Labels for the 3 data points | |
| [1, 0, 1], | |
| [ | |
| # These are feature vectors: | |
| [0.5, 1, 0, 0.2], | |
| [0.2, 0, 1, 0.3], | |
| [0.12, 1, 1, 0.4] | |
| ]) | |
| ''' | |
| assert len(labels) == len(data) | |
| # We split the data up into three parts: tune, train, and test. | |
| # The two numbers specify the boundaries in the dataset. Tuning data | |
| # are (by default, specificed in the parameters above) the first 150 data | |
| # points, training the 5000 after that, and the remainder are test data. | |
| # We do this because tuning is very expensive -- with the grid searches | |
| # used here, we have to perform 5-fold cross-validation about 500 times, | |
| # which means it has to train on a dataset 2500 times! | |
| # We hope that the parameters we get out of tuning on the small dataset | |
| # are relevant to the training set, which is separate and much larger. | |
| # Likewise, we hope that the training dataset is an accurate model of | |
| # the test set. | |
| # Note that even though computing the RBF kernel is O(n^2) we can still | |
| # have quite reasonably sized training sets (just probably not millions). | |
| # If you're curious, try seeing what impact the training size has on the | |
| # accuracy during the test (hint: diminishing returns.) | |
| labels_tune = labels[:tune_size] | |
| data_tune = data[:tune_size] | |
| labels_train = labels[tune_size:tune_size+train_size] | |
| data_train = data[tune_size:tune_size+train_size] | |
| labels_test = labels[tune_size+train_size:] | |
| data_test = data[tune_size+train_size:] | |
| # The RBF kernel has two parameters, C and gamma. C is a general parameter | |
| # to all sorts of SVMs which controls how to penalize misclassifications. | |
| # Gamma appears in the radial basis kernel function (the replacement dot product): | |
| # K(a, b) = exp(-gamma * distance(a,b)^2) | |
| # Where a and b are feature vectors. | |
| # LibSVM initializes C to 1, and gamma to 1/number of features. Gamma has | |
| # a nice theoretical interpretation scaling the distance term -- the higher | |
| # dimensional the feature space the larger distances will tend to be, and | |
| # the smaller gamma should be to compensate. | |
| # Instead of using the defaults, we tune using grid search. Grid search is | |
| # a naive, utterly brute force method which tries every combination of | |
| # values in a grid. Hsu et al. advocate for grid search in the guide | |
| # because it's simple, parallelizable, and not too slow when there are | |
| # only two dimensions (i.e. C and gamma.) | |
| print 'Tune size =', len(labels_tune) | |
| prob = svm_problem(labels_tune, data_tune) | |
| # We use 5-fold cross-validation as the evaluation metric for given | |
| # C and gamma. | |
| null_device = open(devnull, 'w') | |
| def grid_search_evaluate(C, gamma): | |
| param = encode_params(C, gamma, cv=5) | |
| # For some reason cross validation ignores the -q flag. | |
| # This will redirect any output during CV to the null device. | |
| original_stdout = sys.stdout | |
| sys.stdout = null_device | |
| cv_accuracy = svm_train(prob, param) | |
| sys.stdout = original_stdout | |
| return cv_accuracy | |
| # We do a two-stage grid search. The first stage is on an exponential grid | |
| # to determine what order of magnitude the parameters should be, searching | |
| # from 2^-5 to 2^15. The second grid search looks in an area 50% around | |
| # the exponential values to try to tune them a little more finely. | |
| # Coarse exponential search | |
| print 'Tuning with coarse grid search...' | |
| best_accuracy, (C, gamma) = grid_search( | |
| ([2.0**x for x in arange(-5, 15)], [2.0**x for x in arange(-5, 15)]), | |
| grid_search_evaluate | |
| ) | |
| print 'Best tuned accuracy =', best_accuracy | |
| print 'C, gamma =', C, gamma | |
| # Local optimization (best point within 50% of coarse coordinates) | |
| print 'Optimizing locally...' | |
| best_accuracy, (C, gamma) = grid_search( | |
| ([C*x for x in arange(0.5, 1.5, 0.1)], [gamma*x for x in arange(0.5, 1.5, 0.1)]), | |
| grid_search_evaluate | |
| ) | |
| print 'Best tuned accuracy =', best_accuracy | |
| print 'C, gamma =', C, gamma | |
| # Train on the training set (which is larger) | |
| print 'Training the full model...' | |
| print 'Train size =', len(labels_train) | |
| train_prob = svm_problem(labels_train, data_train) | |
| param = encode_params(C, gamma) | |
| model = svm_train(train_prob, param) | |
| # Test on the remainder of the data. | |
| # In a typical application we'd use the model we just trained to predict | |
| # new datapoints where the labels are unknown and hope that the model gets | |
| # them right. Instead, we're just using the rest of the data (with known | |
| # labels) to evaluate the model. | |
| print 'Testing the full model...' | |
| print 'Test size =', len(labels_test) | |
| predicted_labels, (accuracy, mse, scc), p_val = svm_predict( | |
| labels_test, data_test, model, '-q -b 1') | |
| print 'Test accuracy =', accuracy | |
def arange(start, stop, step=1.0):
    '''Like range(), but allows floats.

    Yields start, start + step, start + 2*step, ... for every value
    strictly less than stop.

    Each value is computed as start + i*step rather than by repeated
    addition, which avoids accumulating floating-point rounding error
    over many steps (repeated += drift can add or drop an endpoint).
    '''
    i = 0
    value = start
    while value < stop:
        yield value
        i += 1
        value = start + i * step
def encode_params(C, gamma, cv=None):
    '''Construct a svm_parameter object, as used for svm_train.

    Flags used:
        -q    suppress output
        -t 2  RBF kernel
        -b 1  train for probability estimates
        -c    the C (misclassification penalty) parameter
        -g    the gamma (RBF width) parameter
        -v N  N-fold cross-validation (svm_train then returns the CV
              accuracy instead of a model)

    Arguments:
        C: misclassification penalty parameter.
        gamma: RBF kernel coefficient.
        cv: if not None, the number of cross-validation folds to request.
    '''
    # Build the common flag string once; append -v only when CV is wanted.
    flags = '-q -t 2 -c %f -b 1 -g %f' % (C, gamma)
    if cv is not None:
        flags += ' -v %d' % cv
    return svm_parameter(flags)
def grid_search(dimensions, evaluator):
    '''A naive, exhaustive grid search.

    Arguments:
        dimensions: a list of dimensions, each one of which is a list of
            values to try.
        evaluator: a function mapping from coordinates to a value.

    Returns a (best_value, best_coordinates) tuple: the maximum value the
    evaluator produced over the whole grid, and the coordinates that
    produced it. If several points tie, the first one encountered (in
    itertools.product order) wins.
    '''
    best_value = None
    best_coordinates = None
    for coordinates in itertools.product(*dimensions):
        val = evaluator(*coordinates)
        # Strict > keeps the earliest coordinates on ties.
        if best_value is None or val > best_value:
            best_value = val
            best_coordinates = coordinates
    return best_value, best_coordinates
# Write your transformation code here!
# - Load the adults.csv data (I recommend Python's csv module)
# - Extract the labels (in the last column) and transform them to numbers. Make
#   sure you remove the label column from the feature data!
# - Transform the other columns
# Eventually, assign these. See docstring for seven_steps_to_OK_performance.
#labels =
#data =
# Note that you can also call this function with keyword parameters
# tune_size and train_size if you want to test your code with something
# quicker. Try tune_size=50 and train_size=500 (should take less than 10s.)
# NOTE(review): this call raises NameError until the `labels` and `data`
# assignments above are filled in -- that is intentional for the stencil.
seven_steps_to_OK_performance(labels, data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment