feat: complete problem 10~12

This commit is contained in:
Ting-Jun Wang 2023-12-21 08:16:09 +08:00
parent e6219a0c81
commit a73d25b509
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354
3 changed files with 591 additions and 0 deletions

174
hw6/p10.py Normal file
View File

@@ -0,0 +1,174 @@
import random
import matplotlib.pyplot as plt
def load_data(path):
    """Parse a LIBSVM-style sparse data file into a list of samples.

    Each line looks like ``y i1:v1 i2:v2 ...`` with 1-based feature
    indices; missing features default to 0.  Returns a list of dicts
    ``{'x': [8 floats], 'y': int label}``.
    """
    samples = []
    with open(path) as fp:
        for line in fp:
            tokens = line.split()
            label = int(tokens[0])
            features = [0 for _ in range(8)]
            for token in tokens[1:]:
                idx, raw = token.split(':')
                # Convert the 1-based sparse index to a 0-based slot.
                features[int(idx) - 1] = float(raw)
            samples.append({
                'x': features,
                'y': label,
            })
    return samples
class Node():
    """A node of a CART regression tree using squared-error impurity.

    Internal nodes hold a split ``(feature_i, theta)``; leaves hold a
    constant prediction in ``value``.  Build with ``decision_stump()``
    followed by ``expand()``.
    """

    def __init__(self, datas):
        # datas: list of {'x': list of 8 feature floats, 'y': numeric label}
        self.datas = datas
        self.theta = None       # split threshold (internal nodes only)
        self.feature_i = None   # split feature index (internal nodes only)
        self.value = None       # constant prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Return the tree's prediction for one sample dict ``{'x': ...}``."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Pick the (feature, threshold) split minimizing total weighted
        squared-error impurity; if no split exists, become a leaf."""
        def get_impurity(pairs):
            # Mean squared deviation of the labels in `pairs` (0 if empty).
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: no split needed.
            self.value = next(iter(different_y))
            return

        min_impurity = float('inf')
        best_feature_i, best_theta = None, None
        for feature_i in range(8):
            # Project onto feature_i and sort so thresholds are midpoints
            # between consecutive values.
            pairs = sorted(
                ({'feature_i': data['x'][feature_i], 'y': data['y']}
                 for data in self.datas),
                key=lambda p: p['feature_i'],
            )
            # BUG FIX: compare the feature VALUES, not the whole dicts.
            # The original compared {'feature_i', 'y'} dicts, so a feature
            # whose values were all identical (but labels differed) was
            # still considered for splitting.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue
            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]
            for theta in thetas:
                front_data = [p for p in pairs if p['feature_i'] <= theta]
                back_data = [p for p in pairs if p['feature_i'] > theta]
                # Weighted impurity of the two sides for (feature_i, theta).
                impurity = (len(front_data) * get_impurity(front_data)
                            + len(back_data) * get_impurity(back_data))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta
        if best_feature_i is not None:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean label.
            self.value = sum(data['y'] for data in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data into left/right children."""
        left_data = [d for d in self.datas
                     if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas
                      if d['x'][self.feature_i] > self.theta]
        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: become a leaf predicting the mean label.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
            return
        self.left = Node(left_data)
        self.right = Node(right_data)
        self.left.decision_stump()
        self.right.decision_stump()
        # A child with a threshold is internal and must keep growing.
        if self.left.theta is not None:
            self.left.expand()
        if self.right.theta is not None:
            self.right.expand()
def square_error(target, predict):
    """Return the squared error (predict - target)**2 of one prediction."""
    diff = predict - target
    return diff * diff
if __name__ == '__main__':
    # Problem 10: train 2000 bagged regression trees and plot a histogram
    # of each single tree's average squared test error.
    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')
    error_logs = []
    for i in range(2000):
        # Bagging: sample N/2 training points without replacement per tree.
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)
        root = Node(bagging_train)
        root.decision_stump()
        root.expand()
        # Average squared error of this single tree on the test set.
        errors = 0
        for data in test:
            predict_y = root.predict(data)
            error = square_error(data['y'], predict_y)
            errors += error
        print(i)
        print(" ANS:", errors/len(test))
        error_logs.append(errors/len(test))
    plt.hist(error_logs)
    plt.savefig('p10.png')

219
hw6/p11.py Normal file
View File

@@ -0,0 +1,219 @@
import random
import matplotlib.pyplot as plt
def load_data(path):
    """Parse a LIBSVM-style sparse data file into a list of samples.

    Each line looks like ``y i1:v1 i2:v2 ...`` with 1-based feature
    indices; missing features default to 0.  Returns a list of dicts
    ``{'x': [8 floats], 'y': int label}``.
    """
    samples = []
    with open(path) as fp:
        for line in fp:
            tokens = line.split()
            label = int(tokens[0])
            features = [0 for _ in range(8)]
            for token in tokens[1:]:
                idx, raw = token.split(':')
                # Convert the 1-based sparse index to a 0-based slot.
                features[int(idx) - 1] = float(raw)
            samples.append({
                'x': features,
                'y': label,
            })
    return samples
class Node():
    """A node of a CART regression tree using squared-error impurity.

    Internal nodes hold a split ``(feature_i, theta)``; leaves hold a
    constant prediction in ``value``.  Build with ``decision_stump()``
    followed by ``expand()``.
    """

    def __init__(self, datas):
        # datas: list of {'x': list of 8 feature floats, 'y': numeric label}
        self.datas = datas
        self.theta = None       # split threshold (internal nodes only)
        self.feature_i = None   # split feature index (internal nodes only)
        self.value = None       # constant prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Return the tree's prediction for one sample dict ``{'x': ...}``."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Pick the (feature, threshold) split minimizing total weighted
        squared-error impurity; if no split exists, become a leaf."""
        def get_impurity(pairs):
            # Mean squared deviation of the labels in `pairs` (0 if empty).
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: no split needed.
            self.value = next(iter(different_y))
            return

        min_impurity = float('inf')
        best_feature_i, best_theta = None, None
        for feature_i in range(8):
            # Project onto feature_i and sort so thresholds are midpoints
            # between consecutive values.
            pairs = sorted(
                ({'feature_i': data['x'][feature_i], 'y': data['y']}
                 for data in self.datas),
                key=lambda p: p['feature_i'],
            )
            # BUG FIX: compare the feature VALUES, not the whole dicts.
            # The original compared {'feature_i', 'y'} dicts, so a feature
            # whose values were all identical (but labels differed) was
            # still considered for splitting.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue
            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]
            for theta in thetas:
                front_data = [p for p in pairs if p['feature_i'] <= theta]
                back_data = [p for p in pairs if p['feature_i'] > theta]
                # Weighted impurity of the two sides for (feature_i, theta).
                impurity = (len(front_data) * get_impurity(front_data)
                            + len(back_data) * get_impurity(back_data))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta
        if best_feature_i is not None:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean label.
            self.value = sum(data['y'] for data in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data into left/right children."""
        left_data = [d for d in self.datas
                     if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas
                      if d['x'][self.feature_i] > self.theta]
        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: become a leaf predicting the mean label.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
            return
        self.left = Node(left_data)
        self.right = Node(right_data)
        self.left.decision_stump()
        self.right.decision_stump()
        # A child with a threshold is internal and must keep growing.
        if self.left.theta is not None:
            self.left.expand()
        if self.right.theta is not None:
            self.right.expand()
def square_error(target, predict):
    """Return the squared error (predict - target)**2 of one prediction."""
    diff = predict - target
    return diff * diff
if __name__ == '__main__':
    # Problem 11: for each of 2000 bagged trees record (E_in, E_out) as a
    # blue scatter point, then plot the aggregated (random forest) error
    # as a single large red point.
    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')
    e_in_logs = []
    e_out_logs = []
    roots = []
    FOREST_SIZE = 2000
    for i in range(FOREST_SIZE):
        # Bagging: sample N/2 training points without replacement per tree.
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)
        root = Node(bagging_train)
        root.decision_stump()
        root.expand()
        roots.append(root)
        # In-sample error of this single tree.
        errors = 0
        for data in train:
            predict_y = root.predict(data)
            error = square_error(data['y'], predict_y)
            errors += error
        print(i)
        # BUG FIX: E_in is the average over the TRAINING set; the original
        # divided by len(test).
        print(" E_in:", errors/len(train))
        e_in_logs.append(errors/len(train))
        # Out-of-sample error of this single tree.
        errors = 0
        for data in test:
            predict_y = root.predict(data)
            error = square_error(data['y'], predict_y)
            errors += error
        print(i)
        print(" E_out:", errors/len(test))
        e_out_logs.append(errors/len(test))
    plt.scatter(e_in_logs, e_out_logs, c='b')
    # Aggregated forest prediction: average the outputs of all trees.
    e_in_logs = []
    e_out_logs = []
    errors = 0
    for data in train:
        ans_sum = 0
        for i in range(FOREST_SIZE):
            ans_sum += roots[i].predict(data)
        predict_y = ans_sum / FOREST_SIZE
        errors += square_error(data['y'], predict_y)
    e_in_logs.append(errors/len(train))
    errors = 0
    for data in test:
        ans_sum = 0
        for i in range(FOREST_SIZE):
            ans_sum += roots[i].predict(data)
        predict_y = ans_sum / FOREST_SIZE
        errors += square_error(data['y'], predict_y)
    e_out_logs.append(errors/len(test))
    plt.scatter(e_in_logs, e_out_logs, c='r', s=100)
    plt.savefig('p11.png')

198
hw6/p12.py Normal file
View File

@@ -0,0 +1,198 @@
import random
import matplotlib.pyplot as plt
def load_data(path):
    """Parse a LIBSVM-style sparse data file into a list of samples.

    Each line looks like ``y i1:v1 i2:v2 ...`` with 1-based feature
    indices; missing features default to 0.  Returns a list of dicts
    ``{'x': [8 floats], 'y': int label}``.
    """
    samples = []
    with open(path) as fp:
        for line in fp:
            tokens = line.split()
            label = int(tokens[0])
            features = [0 for _ in range(8)]
            for token in tokens[1:]:
                idx, raw = token.split(':')
                # Convert the 1-based sparse index to a 0-based slot.
                features[int(idx) - 1] = float(raw)
            samples.append({
                'x': features,
                'y': label,
            })
    return samples
class Node():
    """A node of a CART regression tree using squared-error impurity.

    Internal nodes hold a split ``(feature_i, theta)``; leaves hold a
    constant prediction in ``value``.  Build with ``decision_stump()``
    followed by ``expand()``.
    """

    def __init__(self, datas):
        # datas: list of {'x': list of 8 feature floats, 'y': numeric label}
        self.datas = datas
        self.theta = None       # split threshold (internal nodes only)
        self.feature_i = None   # split feature index (internal nodes only)
        self.value = None       # constant prediction (leaves only)
        self.right = None
        self.left = None

    def predict(self, data):
        """Return the tree's prediction for one sample dict ``{'x': ...}``."""
        if self.value is not None:
            return self.value
        if data['x'][self.feature_i] <= self.theta:
            return self.left.predict(data)
        return self.right.predict(data)

    def decision_stump(self):
        """Pick the (feature, threshold) split minimizing total weighted
        squared-error impurity; if no split exists, become a leaf."""
        def get_impurity(pairs):
            # Mean squared deviation of the labels in `pairs` (0 if empty).
            if len(pairs) == 0:
                return 0
            y_bar = sum(p['y'] for p in pairs) / len(pairs)
            return sum((p['y'] - y_bar) ** 2 for p in pairs) / len(pairs)

        different_y = {data['y'] for data in self.datas}
        if len(different_y) == 1:
            # Pure node: no split needed.
            self.value = next(iter(different_y))
            return

        min_impurity = float('inf')
        best_feature_i, best_theta = None, None
        for feature_i in range(8):
            # Project onto feature_i and sort so thresholds are midpoints
            # between consecutive values.
            pairs = sorted(
                ({'feature_i': data['x'][feature_i], 'y': data['y']}
                 for data in self.datas),
                key=lambda p: p['feature_i'],
            )
            # BUG FIX: compare the feature VALUES, not the whole dicts.
            # The original compared {'feature_i', 'y'} dicts, so a feature
            # whose values were all identical (but labels differed) was
            # still considered for splitting.
            if pairs[0]['feature_i'] == pairs[-1]['feature_i']:
                continue
            thetas = [(pairs[i]['feature_i'] + pairs[i - 1]['feature_i']) / 2
                      for i in range(1, len(pairs))]
            for theta in thetas:
                front_data = [p for p in pairs if p['feature_i'] <= theta]
                back_data = [p for p in pairs if p['feature_i'] > theta]
                # Weighted impurity of the two sides for (feature_i, theta).
                impurity = (len(front_data) * get_impurity(front_data)
                            + len(back_data) * get_impurity(back_data))
                if impurity < min_impurity:
                    min_impurity = impurity
                    best_feature_i = feature_i
                    best_theta = theta
        if best_feature_i is not None:
            self.theta = best_theta
            self.feature_i = best_feature_i
        else:
            # No usable split: become a leaf predicting the mean label.
            self.value = sum(data['y'] for data in self.datas) / len(self.datas)

    def expand(self):
        """Recursively split this node's data into left/right children."""
        left_data = [d for d in self.datas
                     if d['x'][self.feature_i] <= self.theta]
        right_data = [d for d in self.datas
                      if d['x'][self.feature_i] > self.theta]
        if len(right_data) == 0 or len(left_data) == 0:
            # Degenerate split: become a leaf predicting the mean label.
            self.value = sum(d['y'] for d in self.datas) / len(self.datas)
            return
        self.left = Node(left_data)
        self.right = Node(right_data)
        self.left.decision_stump()
        self.right.decision_stump()
        # A child with a threshold is internal and must keep growing.
        if self.left.theta is not None:
            self.left.expand()
        if self.right.theta is not None:
            self.right.expand()
def square_error(target, predict):
    """Return the squared error (predict - target)**2 of one prediction."""
    diff = predict - target
    return diff * diff
if __name__ == '__main__':
    # Problem 12: plot E_out of each single tree (blue) against E_out of the
    # aggregated forest G_T using the first T trees (red), T = 1..FOREST_SIZE.
    train = load_data('hw6_train.dat')
    test = load_data('hw6_test.dat')
    e_out_logs = []
    roots = []
    FOREST_SIZE = 2000
    for i in range(FOREST_SIZE):
        # Bagging: sample N/2 training points without replacement per tree.
        sample_size = int(len(train) / 2)
        bagging_train = random.sample(train, k=sample_size)
        root = Node(bagging_train)
        root.decision_stump()
        root.expand()
        roots.append(root)
        # Out-of-sample error of this single tree.
        errors = 0
        for data in test:
            predict_y = root.predict(data)
            errors += square_error(data['y'], predict_y)
        print(i)
        print(" E_out:", errors/len(test))
        e_out_logs.append(errors/len(test))
    plt.plot(list(range(FOREST_SIZE)), e_out_logs, c='b')
    # Aggregated error for every prefix forest G_T.  Keep a running sum of
    # each test point's tree predictions so this is O(T*N) instead of the
    # original O(T^2 * N) re-prediction loop; the additions happen in the
    # same order as before, so the results are bit-identical.
    e_out_logs = []
    running_sum = [0.0] * len(test)
    for T in range(1, FOREST_SIZE + 1):
        tree = roots[T - 1]
        errors = 0
        for j, data in enumerate(test):
            running_sum[j] += tree.predict(data)
            errors += square_error(data['y'], running_sum[j] / T)
        e_out_logs.append(errors/len(test))
    plt.plot(list(range(FOREST_SIZE)), e_out_logs, c='r')
    plt.savefig('p12.png')