diff --git a/hw4/hw4_10.py b/hw4/hw4_10.py
new file mode 100644
--- /dev/null
+++ b/hw4/hw4_10.py
@@ -0,0 +1,80 @@
+"""Pick the regularization strength with the lowest in-sample 0/1 error.
+
+Trains one LIBLINEAR model per lambda in 10**{-6, -4, -2, 0, 2} on
+hw4_train.dat and reports the lambda whose error on that same data is
+lowest, preferring the largest lambda on ties.
+"""
+import numpy as np
+from liblinear.liblinearutil import *
+import math
+
+FILENAME = "hw4_train.dat"
+
+
+def read_data(filename):
+    """Read a whitespace-separated data file into features and labels.
+
+    Each non-blank line holds the feature values followed by the label in
+    the last column.  Blank lines are skipped so a stray empty line does
+    not raise IndexError.
+
+    Returns (x, y): x is a list of float feature lists, y the list of
+    labels converted to int.
+    """
+    x, y = [], []
+    with open(filename) as fp:
+        for line in fp:
+            numbers = [float(i) for i in line.split()]
+            if not numbers:
+                continue
+            x.append(numbers[:-1])
+            y.append(int(numbers[-1]))
+    return x, y
+
+
+def format(features):
+    """Change dense feature rows to LIBSVM format.
+
+    Each row becomes a dict {index: value} with 1-based indices; entries
+    equal to 0.0 are left out.
+    """
+    return [
+        {i: v for i, v in enumerate(feature, start=1) if v != 0.0}
+        for feature in features
+    ]
+
+
+def error(gt, pred):
+    """Return the 0/1 error: the fraction of labels in gt that pred misses.
+
+    Returns 0.0 for an empty gt instead of dividing by zero.
+    """
+    if len(gt) == 0:
+        return 0.0
+    mistakes = sum(1 for i, label in enumerate(gt) if label != pred[i])
+    return mistakes / len(gt)
+
+
+x, y = read_data(FILENAME)
+x = format(x)
+prob = problem(y, x)
+lambda_powers = [-6, -4, -2, 0, 2]
+
+results = []
+for lambda_power in lambda_powers:
+    lambda_value = 10 ** lambda_power
+    # lambda reaches the solver as the -c value C = 1 / (2 * lambda).
+    param_C = 1/(2*lambda_value)
+    param = parameter('-s 0 -c {} -e 0.000001 -q'.format(param_C))
+    model = train(prob, param)
+    # Evaluate on the training data itself (in-sample error).
+    p_label, p_acc, p_val = predict(y, x, model)
+    err = error(y, p_label)
+    print("0/1 error: ", err)
+    print()
+    results.append({'lambda': lambda_power, 'error': err})
+
+# Lowest error wins; ties go to the largest lambda.
+ans = min(results, key=lambda r: (r['error'], -r['lambda']))
+
+print("the largest lambda: {}, log_10(lambda*): {}".format(10**ans['lambda'], ans['lambda']))
diff --git a/hw4/hw4_11.py b/hw4/hw4_11.py
new file mode 100644
--- /dev/null
+++ b/hw4/hw4_11.py
@@ -0,0 +1,111 @@
+"""Pick lambda by validation error over 128 random train/validation splits.
+
+Each round shuffles the data, trains one LIBLINEAR model per lambda on
+the first 120 examples and scores it on the remainder.  The log10 of
+each round's winning lambda is collected and a histogram of them is
+saved to hw4_11.png.
+"""
+import numpy as np
+import datetime
+import random
+from liblinear.liblinearutil import *
+import matplotlib.pyplot as plt
+
+FILENAME = "hw4_train.dat"
+
+
+def read_data(filename):
+    """Read a whitespace-separated data file into features and labels.
+
+    Each non-blank line holds the feature values followed by the label in
+    the last column.  Blank lines are skipped so a stray empty line does
+    not raise IndexError.
+
+    Returns (x, y): x is a list of float feature lists, y the list of
+    labels converted to int.
+    """
+    x, y = [], []
+    with open(filename) as fp:
+        for line in fp:
+            numbers = [float(i) for i in line.split()]
+            if not numbers:
+                continue
+            x.append(numbers[:-1])
+            y.append(int(numbers[-1]))
+    return x, y
+
+
+def format(features):
+    """Change dense feature rows to LIBSVM format.
+
+    Each row becomes a dict {index: value} with 1-based indices; entries
+    equal to 0.0 are left out.
+    """
+    return [
+        {i: v for i, v in enumerate(feature, start=1) if v != 0.0}
+        for feature in features
+    ]
+
+
+def error(gt, pred):
+    """Return the 0/1 error: the fraction of labels in gt that pred misses.
+
+    Returns 0.0 for an empty gt instead of dividing by zero.
+    """
+    if len(gt) == 0:
+        return 0.0
+    mistakes = sum(1 for i, label in enumerate(gt) if label != pred[i])
+    return mistakes / len(gt)
+
+
+def new_split(x, y, train_size=120):
+    """Shuffle the examples and split them into train and validation sets.
+
+    The first train_size shuffled examples form the training set and the
+    remainder the validation set.  The caller is responsible for seeding
+    the random module.
+
+    Returns ((train_x, train_y), (val_x, val_y)), each a tuple.
+    """
+    data = list(zip(x, y))
+    random.shuffle(data)
+    x, y = zip(*data)
+    train_x, val_x = x[:train_size], x[train_size:]
+    train_y, val_y = y[:train_size], y[train_size:]
+    return (train_x, train_y), (val_x, val_y)
+
+
+# Seed once from the clock so every run draws different splits.
+random.seed(datetime.datetime.now().timestamp())
+
+x, y = read_data(FILENAME)
+x = format(x)
+lambda_powers = [-6, -4, -2, 0, 2]
+log_lambda = []
+for _ in range(128):
+    (train_x, train_y), (val_x, val_y) = new_split(x, y)
+    prob = problem(train_y, train_x)
+
+    results = []
+    for lambda_power in lambda_powers:
+        lambda_value = 10 ** lambda_power
+        # lambda reaches the solver as the -c value C = 1 / (2 * lambda).
+        param_C = 1/(2*lambda_value)
+        param = parameter('-s 0 -c {} -e 0.000001 -q'.format(param_C))
+        model = train(prob, param)
+        p_label, p_acc, p_val = predict(val_y, val_x, model)
+        err = error(val_y, p_label)
+        print("0/1 error: ", err)
+        print()
+        results.append({'lambda': lambda_power, 'error': err})
+
+    # Lowest validation error wins; ties go to the largest lambda.
+    ans = min(results, key=lambda r: (r['error'], -r['lambda']))
+
+    print("the largest lambda: {}, log_10(lambda*): {}".format(10**ans['lambda'], ans['lambda']))
+    print()
+    log_lambda.append(ans['lambda'])
+
+
+plt.hist(log_lambda)
+plt.savefig("hw4_11.png")
diff --git a/hw4/hw4_12.py b/hw4/hw4_12.py
new file mode 100644
--- /dev/null
+++ b/hw4/hw4_12.py
@@ -0,0 +1,123 @@
+"""Pick lambda by cross-validation over 128 random shufflings of the data.
+
+Each round cuts the shuffled data into folds of 40 examples, averages
+the validation error of every lambda across the folds and records the
+log10 of the winning lambda.  A histogram of the winners is saved to
+hw4_12.png.
+"""
+import numpy as np
+import datetime
+import random
+from liblinear.liblinearutil import *
+import matplotlib.pyplot as plt
+
+FILENAME = "hw4_train.dat"
+
+
+def read_data(filename):
+    """Read a whitespace-separated data file into features and labels.
+
+    Each non-blank line holds the feature values followed by the label in
+    the last column.  Blank lines are skipped so a stray empty line does
+    not raise IndexError.
+
+    Returns (x, y): x is a list of float feature lists, y the list of
+    labels converted to int.
+    """
+    x, y = [], []
+    with open(filename) as fp:
+        for line in fp:
+            numbers = [float(i) for i in line.split()]
+            if not numbers:
+                continue
+            x.append(numbers[:-1])
+            y.append(int(numbers[-1]))
+    return x, y
+
+
+def format(features):
+    """Change dense feature rows to LIBSVM format.
+
+    Each row becomes a dict {index: value} with 1-based indices; entries
+    equal to 0.0 are left out.
+    """
+    return [
+        {i: v for i, v in enumerate(feature, start=1) if v != 0.0}
+        for feature in features
+    ]
+
+
+def error(gt, pred):
+    """Return the 0/1 error: the fraction of labels in gt that pred misses.
+
+    Returns 0.0 for an empty gt instead of dividing by zero.
+    """
+    if len(gt) == 0:
+        return 0.0
+    mistakes = sum(1 for i, label in enumerate(gt) if label != pred[i])
+    return mistakes / len(gt)
+
+
+def new_split(x, y, fold_size=40):
+    """Shuffle the examples and cut them into consecutive folds.
+
+    Every fold holds fold_size examples except possibly the last, which
+    takes whatever remains.  The caller is responsible for seeding the
+    random module.
+
+    Returns a list of (fold_x, fold_y) tuples.
+    """
+    data = list(zip(x, y))
+    random.shuffle(data)
+    x, y = zip(*data)
+    return [
+        (x[head:head + fold_size], y[head:head + fold_size])
+        for head in range(0, len(x), fold_size)
+    ]
+
+
+# Seed once from the clock so every run draws different folds.
+random.seed(datetime.datetime.now().timestamp())
+
+x, y = read_data(FILENAME)
+x = format(x)
+log_lambda = []
+lambda_powers = [-6, -4, -2, 0, 2]
+for _ in range(128):
+    folds = new_split(x, y)
+    errors = [0] * len(lambda_powers)
+    for val_index, (val_x, val_y) in enumerate(folds):
+        # Train on every fold except the one held out for validation.
+        train_x, train_y = [], []
+        for i, (fold_x, fold_y) in enumerate(folds):
+            if i != val_index:
+                train_x += fold_x
+                train_y += fold_y
+
+        prob = problem(train_y, train_x)
+
+        for index, lambda_power in enumerate(lambda_powers):
+            lambda_value = 10 ** lambda_power
+            # lambda reaches the solver as the -c value C = 1 / (2 * lambda).
+            param_C = 1/(2*lambda_value)
+            param = parameter('-s 0 -c {} -e 0.000001 -q'.format(param_C))
+            model = train(prob, param)
+            p_label, p_acc, p_val = predict(val_y, val_x, model)
+            errors[index] += error(val_y, p_label)
+
+    # Average each lambda's error over the folds.
+    results = [
+        {'lambda': lambda_power, 'error': total / len(folds)}
+        for lambda_power, total in zip(lambda_powers, errors)
+    ]
+
+    # Lowest mean error wins; ties go to the largest lambda.
+    ans = min(results, key=lambda r: (r['error'], -r['lambda']))
+
+    print("the largest lambda: {}, log_10(lambda*): {}".format(10**ans['lambda'], ans['lambda']))
+    print()
+    log_lambda.append(ans['lambda'])
+
+
+plt.hist(log_lambda)
+plt.savefig("hw4_12.png")