# Reconstructed from git patch a73d25b ("feat: complete problem 10~12"), file hw6/p10.py.
"""Problem 10: error distribution of single bagged regression trees.

Trains 2000 fully grown CART regression trees, each on a random half of the
training set (sampled without replacement), and histograms each tree's mean
squared test error.
"""
import random


def load_data(path):
    """Parse a LIBSVM-style file: ``y idx:val idx:val ...`` per line.

    Returns a list of ``{'x': [8 floats], 'y': int}`` records.  Feature
    indices in the file are 1-based; features absent from a line stay 0.
    """
    datas = []
    with open(path) as fp:
        lines = fp.readlines()

    for line in lines:
        features = [0 for _ in range(8)]
        tokens = line.split()
        y = int(tokens[0])
        for token in tokens[1:]:
            index, value = token.split(':')
            features[int(index) - 1] = float(value)
        datas.append({'x': features, 'y': y})
    return datas


class Node():
    """A regression-tree node.

    Either an internal split (``feature_i``, ``theta`` set, children in
    ``left``/``right``) or a leaf (``value`` set).
    """

    def __init__(self, datas):
        self.datas = datas
        self.theta = None      # split threshold (internal nodes only)
        self.feature_i = None  # split feature index (internal nodes only)
        self.value = None      # prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Walk the tree to a leaf and return its value for one record."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Choose the (feature, threshold) split minimizing total squared error.

        Turns the node into a leaf (sets ``self.value``) when all targets are
        equal or no feature admits a split.
        """

        def get_impurity(pairs):
            # Mean squared deviation from the mean target; 0 for an empty side.
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: nothing left to split.
            self.value = list(different_y)[0]
            return

        min_impurity = 1e9
        best_feature_i, best_theta = -100, -100

        for feature_i in range(8):
            # Project onto one feature and sort to enumerate candidate thresholds.
            pairs = [{'feature_i': d['x'][feature_i], 'y': d['y']} for d in self.datas]
            pairs.sort(key=lambda p: p['feature_i'])

            # BUGFIX: the original compared whole dicts (feature AND target), so
            # a constant feature with differing targets was not skipped and could
            # register a degenerate split; compare the sorted feature values.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue

            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]

            for theta in thetas:
                front = [p for p in pairs if p['feature_i'] <= theta]
                back = [p for p in pairs if p['feature_i'] > theta]
                # Weighted (by side size) squared-error impurity of the split.
                impurity = (len(front) * get_impurity(front)
                            + len(back) * get_impurity(back))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta

        if best_feature_i != -100:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean target.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data until every branch is a leaf."""
        left_data = [d for d in self.datas if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas if d['x'][self.feature_i] > self.theta]

        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: fall back to a mean-value leaf.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
        else:
            self.left = Node(left_data)
            self.right = Node(right_data)

            self.left.decision_stump()
            self.right.decision_stump()

            if self.left.theta is not None:
                self.left.expand()
            if self.right.theta is not None:
                self.right.expand()


def square_error(target, predict):
    """Squared difference between a target and a prediction."""
    return (predict - target) ** 2


if __name__ == '__main__':
    # Imported lazily so the module stays importable without matplotlib.
    import matplotlib.pyplot as plt

    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')

    error_logs = []
    for i in range(2000):
        # Each tree trains on a random half of the data (no replacement).
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)

        root = Node(bagging_train)
        root.decision_stump()
        root.expand()

        errors = 0
        for data in test:
            errors += square_error(data['y'], root.predict(data))
        print(i)
        print(" ANS:", errors / len(test))
        error_logs.append(errors / len(test))

    plt.hist(error_logs)
    plt.savefig('p10.png')
# Reconstructed from git patch a73d25b ("feat: complete problem 10~12"), file hw6/p11.py.
"""Problem 11: E_in vs E_out of single trees and of the aggregated forest.

Trains 2000 regression trees on random halves of the training set, scatters
each tree's (E_in, E_out), then overlays the (E_in, E_out) of the uniformly
aggregated forest.
"""
import random


def load_data(path):
    """Parse a LIBSVM-style file: ``y idx:val idx:val ...`` per line.

    Returns a list of ``{'x': [8 floats], 'y': int}`` records.  Feature
    indices in the file are 1-based; features absent from a line stay 0.
    """
    datas = []
    with open(path) as fp:
        lines = fp.readlines()

    for line in lines:
        features = [0 for _ in range(8)]
        tokens = line.split()
        y = int(tokens[0])
        for token in tokens[1:]:
            index, value = token.split(':')
            features[int(index) - 1] = float(value)
        datas.append({'x': features, 'y': y})
    return datas


class Node():
    """A regression-tree node.

    Either an internal split (``feature_i``, ``theta`` set, children in
    ``left``/``right``) or a leaf (``value`` set).
    """

    def __init__(self, datas):
        self.datas = datas
        self.theta = None      # split threshold (internal nodes only)
        self.feature_i = None  # split feature index (internal nodes only)
        self.value = None      # prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Walk the tree to a leaf and return its value for one record."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Choose the (feature, threshold) split minimizing total squared error.

        Turns the node into a leaf (sets ``self.value``) when all targets are
        equal or no feature admits a split.
        """

        def get_impurity(pairs):
            # Mean squared deviation from the mean target; 0 for an empty side.
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: nothing left to split.
            self.value = list(different_y)[0]
            return

        min_impurity = 1e9
        best_feature_i, best_theta = -100, -100

        for feature_i in range(8):
            # Project onto one feature and sort to enumerate candidate thresholds.
            pairs = [{'feature_i': d['x'][feature_i], 'y': d['y']} for d in self.datas]
            pairs.sort(key=lambda p: p['feature_i'])

            # BUGFIX: the original compared whole dicts (feature AND target), so
            # a constant feature with differing targets was not skipped and could
            # register a degenerate split; compare the sorted feature values.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue

            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]

            for theta in thetas:
                front = [p for p in pairs if p['feature_i'] <= theta]
                back = [p for p in pairs if p['feature_i'] > theta]
                # Weighted (by side size) squared-error impurity of the split.
                impurity = (len(front) * get_impurity(front)
                            + len(back) * get_impurity(back))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta

        if best_feature_i != -100:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean target.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data until every branch is a leaf."""
        left_data = [d for d in self.datas if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas if d['x'][self.feature_i] > self.theta]

        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: fall back to a mean-value leaf.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
        else:
            self.left = Node(left_data)
            self.right = Node(right_data)

            self.left.decision_stump()
            self.right.decision_stump()

            if self.left.theta is not None:
                self.left.expand()
            if self.right.theta is not None:
                self.right.expand()


def square_error(target, predict):
    """Squared difference between a target and a prediction."""
    return (predict - target) ** 2


if __name__ == '__main__':
    # Imported lazily so the module stays importable without matplotlib.
    import matplotlib.pyplot as plt

    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')

    e_in_logs = []
    e_out_logs = []
    roots = []
    FOREST_SIZE = 2000
    for i in range(FOREST_SIZE):
        # Each tree trains on a random half of the data (no replacement).
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)

        root = Node(bagging_train)
        root.decision_stump()
        root.expand()
        roots.append(root)

        errors = 0
        for data in train:
            errors += square_error(data['y'], root.predict(data))
        print(i)
        # BUGFIX: E_in was divided by len(test) although the errors above were
        # accumulated over the training set; normalize by len(train).
        print(" E_in:", errors / len(train))
        e_in_logs.append(errors / len(train))

        errors = 0
        for data in test:
            errors += square_error(data['y'], root.predict(data))
        print(i)
        print(" E_out:", errors / len(test))
        e_out_logs.append(errors / len(test))

    plt.scatter(e_in_logs, e_out_logs, c='b')

    e_in_logs = []
    e_out_logs = []

    # Aggregate forest prediction: uniform average over all trees.
    errors = 0
    for data in train:
        ans_sum = sum(roots[i].predict(data) for i in range(FOREST_SIZE))
        errors += square_error(data['y'], ans_sum / FOREST_SIZE)
    e_in_logs.append(errors / len(train))

    errors = 0
    for data in test:
        ans_sum = sum(roots[i].predict(data) for i in range(FOREST_SIZE))
        errors += square_error(data['y'], ans_sum / FOREST_SIZE)
    e_out_logs.append(errors / len(test))

    plt.scatter(e_in_logs, e_out_logs, c='r', s=100)

    plt.savefig('p11.png')
# Reconstructed from git patch a73d25b ("feat: complete problem 10~12"), file hw6/p12.py.
"""Problem 12: E_out of the aggregated forest as a function of forest size T.

Trains 2000 regression trees on random halves of the training set, plots each
tree's individual E_out, then plots E_out of the averaged ensemble G_T for
every prefix size T = 1..2000.
"""
import random


def load_data(path):
    """Parse a LIBSVM-style file: ``y idx:val idx:val ...`` per line.

    Returns a list of ``{'x': [8 floats], 'y': int}`` records.  Feature
    indices in the file are 1-based; features absent from a line stay 0.
    """
    datas = []
    with open(path) as fp:
        lines = fp.readlines()

    for line in lines:
        features = [0 for _ in range(8)]
        tokens = line.split()
        y = int(tokens[0])
        for token in tokens[1:]:
            index, value = token.split(':')
            features[int(index) - 1] = float(value)
        datas.append({'x': features, 'y': y})
    return datas


class Node():
    """A regression-tree node.

    Either an internal split (``feature_i``, ``theta`` set, children in
    ``left``/``right``) or a leaf (``value`` set).
    """

    def __init__(self, datas):
        self.datas = datas
        self.theta = None      # split threshold (internal nodes only)
        self.feature_i = None  # split feature index (internal nodes only)
        self.value = None      # prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Walk the tree to a leaf and return its value for one record."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Choose the (feature, threshold) split minimizing total squared error.

        Turns the node into a leaf (sets ``self.value``) when all targets are
        equal or no feature admits a split.
        """

        def get_impurity(pairs):
            # Mean squared deviation from the mean target; 0 for an empty side.
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: nothing left to split.
            self.value = list(different_y)[0]
            return

        min_impurity = 1e9
        best_feature_i, best_theta = -100, -100

        for feature_i in range(8):
            # Project onto one feature and sort to enumerate candidate thresholds.
            pairs = [{'feature_i': d['x'][feature_i], 'y': d['y']} for d in self.datas]
            pairs.sort(key=lambda p: p['feature_i'])

            # BUGFIX: the original compared whole dicts (feature AND target), so
            # a constant feature with differing targets was not skipped and could
            # register a degenerate split; compare the sorted feature values.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue

            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]

            for theta in thetas:
                front = [p for p in pairs if p['feature_i'] <= theta]
                back = [p for p in pairs if p['feature_i'] > theta]
                # Weighted (by side size) squared-error impurity of the split.
                impurity = (len(front) * get_impurity(front)
                            + len(back) * get_impurity(back))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta

        if best_feature_i != -100:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean target.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data until every branch is a leaf."""
        left_data = [d for d in self.datas if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas if d['x'][self.feature_i] > self.theta]

        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: fall back to a mean-value leaf.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
        else:
            self.left = Node(left_data)
            self.right = Node(right_data)

            self.left.decision_stump()
            self.right.decision_stump()

            if self.left.theta is not None:
                self.left.expand()
            if self.right.theta is not None:
                self.right.expand()


def square_error(target, predict):
    """Squared difference between a target and a prediction."""
    return (predict - target) ** 2


if __name__ == '__main__':
    # Imported lazily so the module stays importable without matplotlib.
    import matplotlib.pyplot as plt

    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')

    e_out_logs = []
    roots = []
    FOREST_SIZE = 2000
    for i in range(FOREST_SIZE):
        # Each tree trains on a random half of the data (no replacement).
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)

        root = Node(bagging_train)
        root.decision_stump()
        root.expand()
        roots.append(root)

        errors = 0
        for data in test:
            errors += square_error(data['y'], root.predict(data))
        print(i)
        print(" E_out:", errors / len(test))
        e_out_logs.append(errors / len(test))

    plt.plot(list(range(FOREST_SIZE)), e_out_logs, c='b')

    e_out_logs = []
    # PERF: keep a running prediction sum per test record so G_T for every
    # prefix T costs O(FOREST_SIZE * |test|) total, instead of re-querying all
    # T trees per prefix (the original O(FOREST_SIZE^2 * |test|)).
    running_sums = [0.0] * len(test)
    for T in range(1, FOREST_SIZE + 1):
        errors = 0
        for j, data in enumerate(test):
            running_sums[j] += roots[T - 1].predict(data)
            errors += square_error(data['y'], running_sums[j] / T)
        e_out_logs.append(errors / len(test))

    plt.plot(list(range(FOREST_SIZE)), e_out_logs, c='r')
    plt.savefig('p12.png')