import random

import matplotlib.pyplot as plt


def load_data(path):
    # Each line is libsvm-style: "y index:value index:value ...", 8 features.
    datas = []
    with open(path) as fp:
        for line in fp:
            numbers = line.split()
            y = float(numbers[0])
            x = [0.0] * 8
            for item in numbers[1:]:
                index, value = item.split(':')
                x[int(index) - 1] = float(value)
            datas.append({'x': x, 'y': y})
    return datas


class Node:
    def __init__(self, datas):
        self.datas = datas
        self.theta = None       # split threshold
        self.feature_i = None   # index of the feature used for the split
        self.value = None       # leaf prediction; None for internal nodes
        self.right = None
        self.left = None

    def predict(self, data):
        # A leaf returns its stored value; an internal node recurses
        # into the branch selected by its (feature_i, theta) split.
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        def get_impurity(pairs):
            # Regression impurity: mean squared deviation from the mean y.
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # All labels are identical: terminate with a leaf.
            self.value = different_y.pop()
            return

        min_impurity = float('inf')
        best_feature_i, best_theta = None, None
        for feature_i in range(8):
            # Project the data onto feature_i and sort it, so the candidate
            # thresholds are the midpoints between consecutive feature values.
            pairs = sorted(
                ({'feature_i': data['x'][feature_i], 'y': data['y']}
                 for data in self.datas),
                key=lambda p: p['feature_i'],
            )
            # Skip this feature if all of its values are the same
            # (no split is possible).
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue
            thetas = [
                (pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                for i in range(1, len(pairs))
            ]
            # Find the best theta for this feature.
            for theta in thetas:
                front_data = [p for p in pairs if p['feature_i'] <= theta]
                back_data = [p for p in pairs if p['feature_i'] > theta]
                # Size-weighted impurity of the split (feature_i, theta).
                impurity = (len(front_data) * get_impurity(front_data)
                            + len(back_data) * get_impurity(back_data))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta

        if best_feature_i is not None:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No feature admits a split: terminate with the mean label.
            self.value = sum(data['y'] for data in self.datas) / len(self.datas)

    def expand(self):
        left_data = [d for d in self.datas
                     if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas
                      if d['x'][self.feature_i] > self.theta]
        if len(left_data) == 0 or len(right_data) == 0:
            # Degenerate split: fall back to a leaf with the mean label.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
            return
        self.left = Node(left_data)
        self.right = Node(right_data)
        self.left.decision_stump()
        self.right.decision_stump()
        if self.left.theta is not None:
            self.left.expand()
        if self.right.theta is not None:
            self.right.expand()


def square_error(target, predict):
    return (predict - target) ** 2
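
# A minimal smoke test for the tree alone (not part of the original script;
# the function name and the toy dataset are illustrative assumptions). On a
# separable one-feature toy set, the fully grown tree should fit every
# training point exactly, so each squared error comes out as 0. Call
# _tree_smoke_test() manually to verify.
def _tree_smoke_test():
    toy = [
        {'x': [0.1] + [0.0] * 7, 'y': 1.0},
        {'x': [0.4] + [0.0] * 7, 'y': 1.0},
        {'x': [0.6] + [0.0] * 7, 'y': -1.0},
        {'x': [0.9] + [0.0] * 7, 'y': -1.0},
    ]
    root = Node(toy)
    root.decision_stump()   # best split is feature 0 at theta = 0.5
    root.expand()
    for d in toy:
        assert square_error(d['y'], root.predict(d)) == 0.0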
if __name__ == '__main__':
    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')

    e_in_logs = []
    e_out_logs = []
    roots = []
    FOREST_SIZE = 2000

    for i in range(FOREST_SIZE):
        # Bagging: grow each tree on a random half of the training set.
        sample_size = len(train) // 2
        bagging_train = random.sample(train, k=sample_size)
        root = Node(bagging_train)
        root.decision_stump()
        root.expand()
        roots.append(root)

        # E_in of this single tree, normalized by the training-set size.
        errors = sum(square_error(data['y'], root.predict(data))
                     for data in train)
        print(i)
        print("  E_in:", errors / len(train))
        e_in_logs.append(errors / len(train))

        # E_out of this single tree.
        errors = sum(square_error(data['y'], root.predict(data))
                     for data in test)
        print(i)
        print("  E_out:", errors / len(test))
        e_out_logs.append(errors / len(test))

    plt.scatter(e_in_logs, e_out_logs, c='b')

    # Error of the whole forest: predict with the aggregation, i.e. average
    # the predictions of all trees.
    e_in_logs = []
    e_out_logs = []

    errors = 0
    for data in train:
        predict_y = sum(root.predict(data) for root in roots) / FOREST_SIZE
        errors += square_error(data['y'], predict_y)
    e_in_logs.append(errors / len(train))

    errors = 0
    for data in test:
        predict_y = sum(root.predict(data) for root in roots) / FOREST_SIZE
        errors += square_error(data['y'], predict_y)
    e_out_logs.append(errors / len(test))

    plt.scatter(e_in_logs, e_out_logs, c='r', s=100)
    plt.savefig('p11.png')
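

# Hedged sketch, not part of the original experiment: '_running_error' is an
# illustrative name. It averages the predictions of the first t trees for
# t = 1..len(roots), showing how the aggregated error evolves with forest
# size. Incrementally accumulating per-point prediction sums avoids
# re-predicting with every tree at every step.
def _running_error(roots, datas):
    sums = [0.0] * len(datas)
    curve = []
    for t, root in enumerate(roots, start=1):
        errors = 0.0
        for j, data in enumerate(datas):
            sums[j] += root.predict(data)
            errors += square_error(data['y'], sums[j] / t)
        curve.append(errors / len(datas))
    return curve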