feat: init

commit 18b0cb34d6
Author: snsd0805
Date: 2024-05-21 18:21:25 +08:00
Signed by: snsd0805 (GPG Key ID: 569349933C77A854)
8 changed files with 806 additions and 0 deletions

README.md (new file, +6 lines)

@@ -0,0 +1,6 @@
# Homework 5

## Install Necessary Packages

```bash
conda create -n hw5 python=3.11 -y
conda activate hw5
pip install -r requirements.txt
```
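For reference, a minimal usage sketch based on the flags defined in `pacman.py` (the checkpoint path is an example):

```bash
# train, then record an evaluation video under ./submissions
python pacman.py

# evaluate a saved checkpoint without recording
python pacman.py --eval --eval_model_path ./submissions/pacman_dqn.pt
```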

custom_env.py (new file, +60 lines)

@@ -0,0 +1,60 @@
import gymnasium as gym
import cv2
import numpy as np


def preprocess(img, image_hw=84):
    img = img[1:172, :]  # MsPacman-specific cropping
    img = cv2.resize(img, dsize=(image_hw, image_hw))
    img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) / 255.0
    return img
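# Shape note (an assumption based on standard ALE output): `ALE/MsPacman-v5` frames are
# (210, 160, 3) uint8 RGB arrays; the crop above keeps rows 1:172 (the playfield), and
# preprocess() returns an (image_hw, image_hw) float grayscale array in [0, 1].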
class ImageEnv(gym.Wrapper):
    def __init__(
        self,
        env,
        skip_frames=4,
        stack_frames=4,
        image_hw=84,
        initial_no_op=50,
        **kwargs
    ):
        super(ImageEnv, self).__init__(env, **kwargs)
        self.initial_no_op = initial_no_op
        self.skip_frames = skip_frames
        self.stack_frames = stack_frames
        self.image_hw = image_hw

    def reset(self, **kwargs):
        # Reset the original environment. Accepting **kwargs lets callers pass
        # `seed=...` through to the underlying gymnasium environment.
        state, info = self.env.reset(**kwargs)

        # Do nothing (action 0) for the next `self.initial_no_op` steps.
        for _ in range(self.initial_no_op):
            state, reward, terminated, truncated, info = self.env.step(0)

        # Convert the frame `state` to grayscale and resize it.
        state = preprocess(state, image_hw=self.image_hw)

        # The initial observation is `stack_frames` copies of the frame `state`.
        self.stacked_state = np.tile(state, (self.stack_frames, 1, 1))  # [4, 84, 84]
        return self.stacked_state, info

    def step(self, action):
        # Repeat the action for `self.skip_frames` steps and accumulate the reward.
        rewards = 0
        for _ in range(self.skip_frames):
            state, reward, terminated, truncated, info = self.env.step(action)
            rewards += reward
            if terminated or truncated:
                break

        # Convert the frame `state` to grayscale and resize it.
        state = preprocess(state, image_hw=self.image_hw)

        # Push the current frame `state` onto the end of `self.stacked_state`.
        self.stacked_state = np.concatenate((self.stacked_state[1:], state[np.newaxis]), axis=0)
        return self.stacked_state, rewards, terminated, truncated, info
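For reference, a minimal usage sketch of the wrapper (a hedged example; shapes assume the defaults above):

```python
import gymnasium as gym
from custom_env import ImageEnv

env = ImageEnv(gym.make('ALE/MsPacman-v5'))
state, info = env.reset()  # state: np.ndarray of shape (4, 84, 84)
state, reward, terminated, truncated, info = env.step(0)  # one action, repeated over 4 skipped frames
```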

pacman-intro.ipynb (new file, +385 lines)

File diff suppressed because one or more lines are too long

pacman.py (new file, +154 lines)

@@ -0,0 +1,154 @@
import os
import time
import argparse
from pathlib import Path

import numpy as np
import gymnasium as gym
import torch
import imageio
from tqdm import tqdm

from rl_algorithm import DQN
from custom_env import ImageEnv
from utils import seed_everything, YOUR_CODE_HERE
import utils


def parse_args():
    parser = argparse.ArgumentParser()
    # environment hyperparameters
    parser.add_argument('--env_name', type=str, default='ALE/MsPacman-v5')
    parser.add_argument('--state_dim', type=tuple, default=(4, 84, 84))
    parser.add_argument('--image_hw', type=int, default=84, help='The height and width of the image')
    parser.add_argument('--num_envs', type=int, default=4)
    # DQN hyperparameters
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--epsilon', type=float, default=0.9)
    parser.add_argument('--epsilon_min', type=float, default=0.05)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--warmup_steps', type=int, default=5000)
    parser.add_argument('--buffer_size', type=int, default=int(1e5))
    parser.add_argument('--target_update_interval', type=int, default=10000)
    # training hyperparameters
    parser.add_argument('--max_steps', type=int, default=int(2.5e5))
    parser.add_argument('--eval_interval', type=int, default=10000)
    # others
    parser.add_argument('--save_root', type=Path, default='./submissions')
    # note: splitext removes the extension; rstrip(".py") would strip characters, not a suffix
    parser.add_argument("--exp-name", type=str, default=os.path.splitext(os.path.basename(__file__))[0],
                        help="the name of this experiment")
    # evaluation
    parser.add_argument('--eval', action="store_true", help='evaluate the model')
    parser.add_argument('--eval_model_path', type=str, default=None, help='the path of the model to evaluate')
    return parser.parse_args()
def validation(agent, num_evals=5):
    eval_env = gym.make('ALE/MsPacman-v5')
    eval_env = ImageEnv(eval_env)
    scores = 0
    for i in range(num_evals):
        (state, _), done = eval_env.reset(), False
        while not done:
            "*** YOUR CODE HERE ***"
            utils.raiseNotDefined()
            # choose an action with your agent
            action = YOUR_CODE_HERE
            # apply the action and get feedback from the environment
            next_state, reward, terminated, truncated, info = YOUR_CODE_HERE
            state = next_state
            scores += reward
            done = terminated or truncated
    return np.round(scores / num_evals, 4)
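# Editorial note: evaluate() below wires up the same loop completely; one standard way
# to fill the blanks above (not necessarily the intended solution) is:
#     action = agent.act(state, training=False)
#     next_state, reward, terminated, truncated, info = eval_env.step(action)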
def train(agent, env):
    history = {'Step': [], 'AvgScore': []}
    (state, _) = env.reset()
    for _ in tqdm(range(args.max_steps)):
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # You can track q-losses over training from the `result` variable.
        result = agent.process((state, action, reward, next_state, terminated))
        state = next_state
        if terminated or truncated:
            state, _ = env.reset()
        if agent.total_steps % args.eval_interval == 0:
            avg_score = validation(agent)
            history['Step'].append(agent.total_steps)
            history['AvgScore'].append(avg_score)
            # log info to plot your figure
            "*** YOUR CODE HERE ***"
            # save model
            torch.save(agent.network.state_dict(), save_dir / 'pacman_dqn.pt')
            print("Step: {}, AvgScore: {}, ValueLoss: {}".format(agent.total_steps, avg_score, result["value_loss"]))
def evaluate(agent, eval_env, capture_frames=True):
    seed_everything(0, eval_env)  # don't modify

    # load the model if we are evaluating a saved checkpoint
    if agent is None:
        action_dim = eval_env.action_space.n
        state_dim = (args.num_envs, args.image_hw, args.image_hw)
        agent = DQN(state_dim=state_dim, action_dim=action_dim)
        agent.network.load_state_dict(torch.load(args.eval_model_path))

    (state, _), done = eval_env.reset(), False
    scores = 0

    # record the frames
    if capture_frames:
        writer = imageio.get_writer(save_dir / 'mspacman.mp4', fps=10)
    while not done:
        if capture_frames:
            writer.append_data(eval_env.render())
        else:
            eval_env.render()
        action = agent.act(state, training=False)
        next_state, reward, terminated, truncated, info = eval_env.step(action)
        state = next_state
        scores += reward
        done = terminated or truncated
    if capture_frames:
        writer.close()
    print("The score of the agent: ", scores)
def main():
    env = gym.make(args.env_name)
    env = ImageEnv(env, stack_frames=args.num_envs, image_hw=args.image_hw)
    action_dim = env.action_space.n
    state_dim = (args.num_envs, args.image_hw, args.image_hw)
    agent = DQN(state_dim=state_dim, action_dim=action_dim)

    # train
    train(agent, env)

    # evaluate
    eval_env = gym.make(args.env_name, render_mode='rgb_array')
    eval_env = ImageEnv(eval_env, stack_frames=args.num_envs, image_hw=args.image_hw)
    evaluate(agent, eval_env)


if __name__ == "__main__":
    args = parse_args()
    # save_dir = args.save_root / f"{args.env_name.replace('/', '-')}__{args.exp_name}__{int(time.time())}"
    save_dir = args.save_root
    if not save_dir.exists():
        save_dir.mkdir(parents=True)
    if args.eval:
        eval_env = gym.make(args.env_name, render_mode='rgb_array')
        eval_env = ImageEnv(eval_env, stack_frames=args.num_envs, image_hw=args.image_hw)
        evaluate(agent=None, eval_env=eval_env, capture_frames=False)
    else:
        main()

requirements.txt (new file, +14 lines)

@@ -0,0 +1,14 @@
opencv-python==4.8.1.78
swig==4.2.1
gymnasium==0.29.1
gymnasium[atari,accept-rom-license]==0.29.1
numpy==1.26.4
matplotlib==3.8.4
imageio-ffmpeg
imageio==2.34.1
torch
tqdm

# for CUDA 11.3 torch builds on Linux, use the PyTorch index instead:
# --index-url https://download.pytorch.org/whl/cu113
# torch; sys_platform == "linux"

rl_algorithm.py (new file, +161 lines)

@@ -0,0 +1,161 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from utils import YOUR_CODE_HERE
import utils


class PacmanActionCNN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PacmanActionCNN, self).__init__()
        # build your own CNN model
        "*** YOUR CODE HERE ***"
        utils.raiseNotDefined()
        # this is just an example; you can modify it
        self.conv1 = nn.Conv2d(state_dim, 16, kernel_size=8, stride=4)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        "*** YOUR CODE HERE ***"
        utils.raiseNotDefined()
        return x
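# Editorial sketch of one possible architecture (an assumption, following the classic
# DQN network of Mnih et al. 2015; not necessarily the intended solution). With an
# input of shape [B, 4, 84, 84]:
#     self.conv1 = nn.Conv2d(state_dim, 32, kernel_size=8, stride=4)  # -> [B, 32, 20, 20]
#     self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)         # -> [B, 64, 9, 9]
#     self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)         # -> [B, 64, 7, 7]
#     self.fc1 = nn.Linear(64 * 7 * 7, 512)
#     self.fc2 = nn.Linear(512, action_dim)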
class ReplayBuffer:
    # Referenced the [TD3 official implementation](https://github.com/sfujim/TD3/blob/master/utils.py#L5).
    def __init__(self, state_dim, action_dim, max_size=int(1e5)):
        self.states = np.zeros((max_size, *state_dim), dtype=np.float32)
        self.actions = np.zeros((max_size, *action_dim), dtype=np.int64)
        self.rewards = np.zeros((max_size, 1), dtype=np.float32)
        self.next_states = np.zeros((max_size, *state_dim), dtype=np.float32)
        self.terminated = np.zeros((max_size, 1), dtype=np.float32)
        self.ptr = 0
        self.size = 0
        self.max_size = max_size

    def update(self, state, action, reward, next_state, terminated):
        self.states[self.ptr] = state
        self.actions[self.ptr] = action
        self.rewards[self.ptr] = reward
        self.next_states[self.ptr] = next_state
        self.terminated[self.ptr] = terminated
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = np.random.randint(0, self.size, batch_size)
        return (
            torch.FloatTensor(self.states[ind]),
            # actions stay int64 so they can index q-values with torch.gather()
            torch.LongTensor(self.actions[ind]),
            torch.FloatTensor(self.rewards[ind]),
            torch.FloatTensor(self.next_states[ind]),
            torch.FloatTensor(self.terminated[ind]),
        )
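# Usage sketch (editorial; shapes assume state_dim=(4, 84, 84) and action_dim=(1,)):
#     buf = ReplayBuffer(state_dim=(4, 84, 84), action_dim=(1,))
#     buf.update(state, action, reward, next_state, terminated)  # one transition per call
#     s, a, r, s2, done = buf.sample(64)  # tensors of shape [64, ...]; `a` is int64 for gather()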
class DQN:
    def __init__(
        self,
        state_dim,
        action_dim,
        lr=1e-4,
        epsilon=0.9,
        epsilon_min=0.05,
        gamma=0.99,
        batch_size=64,
        warmup_steps=5000,
        buffer_size=int(1e5),
        target_update_interval=10000,
    ):
        """
        The DQN agent has four methods.
        - __init__(): as usual.
        - act(): takes one state as an np.ndarray and outputs an action following an epsilon-greedy policy.
        - process(): takes one transition as input and defines what the agent does at each step.
        - learn(): samples a mini-batch from the replay buffer and trains the q-network.
        """
        self.action_dim = action_dim
        self.epsilon = epsilon
        self.gamma = gamma
        self.batch_size = batch_size
        self.warmup_steps = warmup_steps
        self.target_update_interval = target_update_interval

        self.network = PacmanActionCNN(state_dim[0], action_dim)
        self.target_network = PacmanActionCNN(state_dim[0], action_dim)
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = torch.optim.RMSprop(self.network.parameters(), lr)
        self.buffer = ReplayBuffer(state_dim, (1,), buffer_size)
        # is_available must be called; the original bare `torch.cuda.is_available` was always truthy
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.network.to(self.device)
        self.target_network.to(self.device)

        self.total_steps = 0
        # linear decay from `epsilon` down to `epsilon_min` over 1e6 steps
        self.epsilon_decay = (epsilon - epsilon_min) / 1e6
    @torch.no_grad()
    def act(self, x, training=True):
        self.network.train(training)
        if training and ((np.random.rand() < self.epsilon) or (self.total_steps < self.warmup_steps)):
            # random action during warmup or with probability epsilon
            action = np.random.randint(0, self.action_dim)
        else:
            # greedy action from the q-network
            x = torch.from_numpy(x).float().unsqueeze(0).to(self.device)
            "*** YOUR CODE HERE ***"
            utils.raiseNotDefined()
            # get q-values from the network
            q_value = YOUR_CODE_HERE
            # get the action with the maximum q-value
            action = YOUR_CODE_HERE
        return action
    def learn(self):
        "*** YOUR CODE HERE ***"
        utils.raiseNotDefined()
        # sample a mini-batch from the replay buffer
        state, action, reward, next_state, terminated = map(
            lambda x: x.to(self.device), self.buffer.sample(self.batch_size)
        )
        # get q-values of the next states from the target network
        next_q = YOUR_CODE_HERE
        # td_target: if terminated, only the reward; otherwise reward + gamma * max(next_q)
        td_target = YOUR_CODE_HERE
        # compute the loss between td_target and the predicted q-values
        loss = YOUR_CODE_HERE
        # reset gradients
        "self.optimizer.YOUR_CODE_HERE"
        # backpropagation
        YOUR_CODE_HERE
        # update the network
        "self.optimizer.YOUR_CODE_HERE"
        return {YOUR_CODE_HERE}  # return a dictionary for logging
    def process(self, transition):
        "*** YOUR CODE HERE ***"
        utils.raiseNotDefined()
        result = {}
        self.total_steps += 1
        # update the replay buffer
        "self.buffer.YOUR_CODE_HERE"
        if self.total_steps > self.warmup_steps:
            result = self.learn()
        if self.total_steps % self.target_update_interval == 0:
            # update the target network
            "self.target_network.YOUR_CODE_HERE"
        self.epsilon -= self.epsilon_decay
        return result
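For reference, a hedged sketch of the standard DQN update that learn() above asks for (one common approach, using a Huber loss; not necessarily the intended solution):

```python
# inside DQN.learn(), after sampling the mini-batch:
with torch.no_grad():
    # max_a' Q_target(s', a'), kept as a column vector [B, 1]
    next_q = self.target_network(next_state).max(dim=1, keepdim=True)[0]
    # if terminated, the target is just the reward
    td_target = reward + (1.0 - terminated) * self.gamma * next_q
q = self.network(state).gather(1, action)  # Q(s, a) for the taken actions, [B, 1]
loss = F.smooth_l1_loss(q, td_target)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
```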

submissions/.DS_Store (vendored binary, new file)

Binary file not shown.

utils.py (new file, +26 lines)

@@ -0,0 +1,26 @@
import inspect
import sys
import os
import torch
import random
import numpy as np


def raiseNotDefined():
    filename = inspect.stack()[1][1]
    line = inspect.stack()[1][2]
    method = inspect.stack()[1][3]
    print(f"*** Method not implemented: {method} at line {line} of {filename} ***")
    sys.exit(1)


def seed_everything(seed, env):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # gymnasium removed Env.seed(); seed through reset() and the action space instead
    env.reset(seed=seed)
    env.action_space.seed(seed)


YOUR_CODE_HERE = "*** YOUR CODE HERE ***"