# Reconstructed from git patch a73d25b ("feat: complete problem 10~12"), file hw6/p10.py.
"""Problem 10: error distribution of single bagged regression trees.

Trains 2000 fully grown CART regression trees, each on a random half of the
training set (sampled without replacement), and histograms each tree's mean
squared test error.
"""
import random


def load_data(path):
    """Parse a LIBSVM-style file: ``y idx:val idx:val ...`` per line.

    Returns a list of ``{'x': [8 floats], 'y': int}`` records.  Feature
    indices in the file are 1-based; features absent from a line stay 0.
    """
    datas = []
    with open(path) as fp:
        lines = fp.readlines()

    for line in lines:
        features = [0 for _ in range(8)]
        tokens = line.split()
        y = int(tokens[0])
        for token in tokens[1:]:
            index, value = token.split(':')
            features[int(index) - 1] = float(value)
        datas.append({'x': features, 'y': y})
    return datas


class Node():
    """A regression-tree node.

    Either an internal split (``feature_i``, ``theta`` set, children in
    ``left``/``right``) or a leaf (``value`` set).
    """

    def __init__(self, datas):
        self.datas = datas
        self.theta = None      # split threshold (internal nodes only)
        self.feature_i = None  # split feature index (internal nodes only)
        self.value = None      # prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Walk the tree to a leaf and return its value for one record."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Choose the (feature, threshold) split minimizing total squared error.

        Turns the node into a leaf (sets ``self.value``) when all targets are
        equal or no feature admits a split.
        """

        def get_impurity(pairs):
            # Mean squared deviation from the mean target; 0 for an empty side.
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: nothing left to split.
            self.value = list(different_y)[0]
            return

        min_impurity = 1e9
        best_feature_i, best_theta = -100, -100

        for feature_i in range(8):
            # Project onto one feature and sort to enumerate candidate thresholds.
            pairs = [{'feature_i': d['x'][feature_i], 'y': d['y']} for d in self.datas]
            pairs.sort(key=lambda p: p['feature_i'])

            # BUGFIX: the original compared whole dicts (feature AND target), so
            # a constant feature with differing targets was not skipped and could
            # register a degenerate split; compare the sorted feature values.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue

            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]

            for theta in thetas:
                front = [p for p in pairs if p['feature_i'] <= theta]
                back = [p for p in pairs if p['feature_i'] > theta]
                # Weighted (by side size) squared-error impurity of the split.
                impurity = (len(front) * get_impurity(front)
                            + len(back) * get_impurity(back))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta

        if best_feature_i != -100:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean target.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data until every branch is a leaf."""
        left_data = [d for d in self.datas if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas if d['x'][self.feature_i] > self.theta]

        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: fall back to a mean-value leaf.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
        else:
            self.left = Node(left_data)
            self.right = Node(right_data)

            self.left.decision_stump()
            self.right.decision_stump()

            if self.left.theta is not None:
                self.left.expand()
            if self.right.theta is not None:
                self.right.expand()


def square_error(target, predict):
    """Squared difference between a target and a prediction."""
    return (predict - target) ** 2


if __name__ == '__main__':
    # Imported lazily so the module stays importable without matplotlib.
    import matplotlib.pyplot as plt

    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')

    error_logs = []
    for i in range(2000):
        # Each tree trains on a random half of the data (no replacement).
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)

        root = Node(bagging_train)
        root.decision_stump()
        root.expand()

        errors = 0
        for data in test:
            errors += square_error(data['y'], root.predict(data))
        print(i)
        print(" ANS:", errors / len(test))
        error_logs.append(errors / len(test))

    plt.hist(error_logs)
    plt.savefig('p10.png')
# Reconstructed from git patch a73d25b ("feat: complete problem 10~12"), file hw6/p11.py.
"""Problem 11: E_in vs E_out of single trees and of the aggregated forest.

Trains 2000 regression trees on random halves of the training set, scatters
each tree's (E_in, E_out), then overlays the (E_in, E_out) of the uniformly
aggregated forest.
"""
import random


def load_data(path):
    """Parse a LIBSVM-style file: ``y idx:val idx:val ...`` per line.

    Returns a list of ``{'x': [8 floats], 'y': int}`` records.  Feature
    indices in the file are 1-based; features absent from a line stay 0.
    """
    datas = []
    with open(path) as fp:
        lines = fp.readlines()

    for line in lines:
        features = [0 for _ in range(8)]
        tokens = line.split()
        y = int(tokens[0])
        for token in tokens[1:]:
            index, value = token.split(':')
            features[int(index) - 1] = float(value)
        datas.append({'x': features, 'y': y})
    return datas


class Node():
    """A regression-tree node.

    Either an internal split (``feature_i``, ``theta`` set, children in
    ``left``/``right``) or a leaf (``value`` set).
    """

    def __init__(self, datas):
        self.datas = datas
        self.theta = None      # split threshold (internal nodes only)
        self.feature_i = None  # split feature index (internal nodes only)
        self.value = None      # prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Walk the tree to a leaf and return its value for one record."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Choose the (feature, threshold) split minimizing total squared error.

        Turns the node into a leaf (sets ``self.value``) when all targets are
        equal or no feature admits a split.
        """

        def get_impurity(pairs):
            # Mean squared deviation from the mean target; 0 for an empty side.
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: nothing left to split.
            self.value = list(different_y)[0]
            return

        min_impurity = 1e9
        best_feature_i, best_theta = -100, -100

        for feature_i in range(8):
            # Project onto one feature and sort to enumerate candidate thresholds.
            pairs = [{'feature_i': d['x'][feature_i], 'y': d['y']} for d in self.datas]
            pairs.sort(key=lambda p: p['feature_i'])

            # BUGFIX: the original compared whole dicts (feature AND target), so
            # a constant feature with differing targets was not skipped and could
            # register a degenerate split; compare the sorted feature values.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue

            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]

            for theta in thetas:
                front = [p for p in pairs if p['feature_i'] <= theta]
                back = [p for p in pairs if p['feature_i'] > theta]
                # Weighted (by side size) squared-error impurity of the split.
                impurity = (len(front) * get_impurity(front)
                            + len(back) * get_impurity(back))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta

        if best_feature_i != -100:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean target.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data until every branch is a leaf."""
        left_data = [d for d in self.datas if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas if d['x'][self.feature_i] > self.theta]

        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: fall back to a mean-value leaf.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
        else:
            self.left = Node(left_data)
            self.right = Node(right_data)

            self.left.decision_stump()
            self.right.decision_stump()

            if self.left.theta is not None:
                self.left.expand()
            if self.right.theta is not None:
                self.right.expand()


def square_error(target, predict):
    """Squared difference between a target and a prediction."""
    return (predict - target) ** 2


if __name__ == '__main__':
    # Imported lazily so the module stays importable without matplotlib.
    import matplotlib.pyplot as plt

    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')

    e_in_logs = []
    e_out_logs = []
    roots = []
    FOREST_SIZE = 2000
    for i in range(FOREST_SIZE):
        # Each tree trains on a random half of the data (no replacement).
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)

        root = Node(bagging_train)
        root.decision_stump()
        root.expand()
        roots.append(root)

        errors = 0
        for data in train:
            errors += square_error(data['y'], root.predict(data))
        print(i)
        # BUGFIX: E_in was divided by len(test) although the errors above were
        # accumulated over the training set; normalize by len(train).
        print(" E_in:", errors / len(train))
        e_in_logs.append(errors / len(train))

        errors = 0
        for data in test:
            errors += square_error(data['y'], root.predict(data))
        print(i)
        print(" E_out:", errors / len(test))
        e_out_logs.append(errors / len(test))

    plt.scatter(e_in_logs, e_out_logs, c='b')

    e_in_logs = []
    e_out_logs = []

    # Aggregate forest prediction: uniform average over all trees.
    errors = 0
    for data in train:
        ans_sum = sum(roots[i].predict(data) for i in range(FOREST_SIZE))
        errors += square_error(data['y'], ans_sum / FOREST_SIZE)
    e_in_logs.append(errors / len(train))

    errors = 0
    for data in test:
        ans_sum = sum(roots[i].predict(data) for i in range(FOREST_SIZE))
        errors += square_error(data['y'], ans_sum / FOREST_SIZE)
    e_out_logs.append(errors / len(test))

    plt.scatter(e_in_logs, e_out_logs, c='r', s=100)

    plt.savefig('p11.png')
# Reconstructed from git patch a73d25b ("feat: complete problem 10~12"), file hw6/p12.py.
"""Problem 12: E_out of the aggregated forest as a function of forest size T.

Trains 2000 regression trees on random halves of the training set, plots each
tree's individual E_out, then plots E_out of the averaged ensemble G_T for
every prefix size T = 1..2000.
"""
import random


def load_data(path):
    """Parse a LIBSVM-style file: ``y idx:val idx:val ...`` per line.

    Returns a list of ``{'x': [8 floats], 'y': int}`` records.  Feature
    indices in the file are 1-based; features absent from a line stay 0.
    """
    datas = []
    with open(path) as fp:
        lines = fp.readlines()

    for line in lines:
        features = [0 for _ in range(8)]
        tokens = line.split()
        y = int(tokens[0])
        for token in tokens[1:]:
            index, value = token.split(':')
            features[int(index) - 1] = float(value)
        datas.append({'x': features, 'y': y})
    return datas


class Node():
    """A regression-tree node.

    Either an internal split (``feature_i``, ``theta`` set, children in
    ``left``/``right``) or a leaf (``value`` set).
    """

    def __init__(self, datas):
        self.datas = datas
        self.theta = None      # split threshold (internal nodes only)
        self.feature_i = None  # split feature index (internal nodes only)
        self.value = None      # prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Walk the tree to a leaf and return its value for one record."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Choose the (feature, threshold) split minimizing total squared error.

        Turns the node into a leaf (sets ``self.value``) when all targets are
        equal or no feature admits a split.
        """

        def get_impurity(pairs):
            # Mean squared deviation from the mean target; 0 for an empty side.
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: nothing left to split.
            self.value = list(different_y)[0]
            return

        min_impurity = 1e9
        best_feature_i, best_theta = -100, -100

        for feature_i in range(8):
            # Project onto one feature and sort to enumerate candidate thresholds.
            pairs = [{'feature_i': d['x'][feature_i], 'y': d['y']} for d in self.datas]
            pairs.sort(key=lambda p: p['feature_i'])

            # BUGFIX: the original compared whole dicts (feature AND target), so
            # a constant feature with differing targets was not skipped and could
            # register a degenerate split; compare the sorted feature values.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue

            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]

            for theta in thetas:
                front = [p for p in pairs if p['feature_i'] <= theta]
                back = [p for p in pairs if p['feature_i'] > theta]
                # Weighted (by side size) squared-error impurity of the split.
                impurity = (len(front) * get_impurity(front)
                            + len(back) * get_impurity(back))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta

        if best_feature_i != -100:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean target.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data until every branch is a leaf."""
        left_data = [d for d in self.datas if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas if d['x'][self.feature_i] > self.theta]

        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: fall back to a mean-value leaf.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
        else:
            self.left = Node(left_data)
            self.right = Node(right_data)

            self.left.decision_stump()
            self.right.decision_stump()

            if self.left.theta is not None:
                self.left.expand()
            if self.right.theta is not None:
                self.right.expand()


def square_error(target, predict):
    """Squared difference between a target and a prediction."""
    return (predict - target) ** 2


if __name__ == '__main__':
    # Imported lazily so the module stays importable without matplotlib.
    import matplotlib.pyplot as plt

    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')

    e_out_logs = []
    roots = []
    FOREST_SIZE = 2000
    for i in range(FOREST_SIZE):
        # Each tree trains on a random half of the data (no replacement).
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)

        root = Node(bagging_train)
        root.decision_stump()
        root.expand()
        roots.append(root)

        errors = 0
        for data in test:
            errors += square_error(data['y'], root.predict(data))
        print(i)
        print(" E_out:", errors / len(test))
        e_out_logs.append(errors / len(test))

    plt.plot(list(range(FOREST_SIZE)), e_out_logs, c='b')

    e_out_logs = []
    # PERF: keep a running prediction sum per test record so G_T for every
    # prefix T costs O(FOREST_SIZE * |test|) total, instead of re-querying all
    # T trees per prefix (the original O(FOREST_SIZE^2 * |test|)).
    running_sums = [0.0] * len(test)
    for T in range(1, FOREST_SIZE + 1):
        errors = 0
        for j, data in enumerate(test):
            running_sums[j] += roots[T - 1].predict(data)
            errors += square_error(data['y'], running_sums[j] / T)
        e_out_logs.append(errors / len(test))

    plt.plot(list(range(FOREST_SIZE)), e_out_logs, c='r')
    plt.savefig('p12.png')