feat: init

commit 18b0cb34d6
README.md (new file, +6)
@@ -0,0 +1,6 @@
# Homework 5

## Install Necessary Packages
conda create -n hw5 python=3.11 -y
conda activate hw5
pip install -r requirements.txt
custom_env.py (new file, +60)
@@ -0,0 +1,60 @@
import gymnasium as gym
import cv2
import numpy as np


def preprocess(img, image_hw=84):
    img = img[1:172, :]  # MsPacman-specific cropping
    img = cv2.resize(img, dsize=(image_hw, image_hw))

    img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) / 255.0
    return img


class ImageEnv(gym.Wrapper):
    def __init__(
        self,
        env,
        skip_frames=4,
        stack_frames=4,
        image_hw=84,
        initial_no_op=50,
        **kwargs
    ):
        super(ImageEnv, self).__init__(env, **kwargs)
        self.initial_no_op = initial_no_op
        self.skip_frames = skip_frames
        self.stack_frames = stack_frames
        self.image_hw = image_hw

    def reset(self):
        # Reset the original environment.
        state, info = self.env.reset()

        # Do nothing for the next `self.initial_no_op` steps
        for i in range(self.initial_no_op):
            state, reward, terminated, truncated, info = self.env.step(0)

        # Convert the frame `state` to grayscale and resize it
        state = preprocess(state, image_hw=self.image_hw)

        # The initial observation is simply a copy of the frame `state`
        self.stacked_state = np.tile(state, (self.stack_frames, 1, 1))  # [4, 84, 84]

        return self.stacked_state, info

    def step(self, action):
        # We take the same action for self.skip_frames steps
        rewards = 0
        for _ in range(self.skip_frames):
            state, reward, terminated, truncated, info = self.env.step(action)
            rewards += reward
            if terminated or truncated:
                break

        # Convert the frame `state` to grayscale and resize it
        state = preprocess(state, image_hw=self.image_hw)

        # Push the current frame `state` at the end of self.stacked_state
        self.stacked_state = np.concatenate((self.stacked_state[1:], state[np.newaxis]), axis=0)

        return self.stacked_state, rewards, terminated, truncated, info
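For orientation, a minimal usage sketch of the wrapper above (assuming the Atari ROMs pulled in by gymnasium[atari, accept-rom-license] are installed; this snippet is illustrative and not part of the committed files):

import gymnasium as gym
from custom_env import ImageEnv

# Wrap the raw Atari environment; observations come back cropped, resized,
# converted to grayscale, and stacked along the first axis.
env = gym.make('ALE/MsPacman-v5')
env = ImageEnv(env)

obs, info = env.reset()
print(obs.shape)  # (4, 84, 84) with the default stack_frames and image_hw

obs, reward, terminated, truncated, info = env.step(0)  # action 0, repeated over the skipped frames
print(obs.shape, reward)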
pacman-intro.ipynb (new file, +385)
File diff suppressed because one or more lines are too long
pacman.py (new file, +154)
@@ -0,0 +1,154 @@
import os
import time
import argparse
from pathlib import Path

import numpy as np
import gymnasium as gym
import torch
import imageio
from tqdm import tqdm

from rl_algorithm import DQN
from custom_env import ImageEnv
from utils import seed_everything, YOUR_CODE_HERE
import utils


def parse_args():
    parser = argparse.ArgumentParser()
    # environment hyperparameters
    parser.add_argument('--env_name', type=str, default='ALE/MsPacman-v5')
    parser.add_argument('--state_dim', type=tuple, default=(4, 84, 84))
    parser.add_argument('--image_hw', type=int, default=84, help='The height and width of the image')
    parser.add_argument('--num_envs', type=int, default=4)
    # DQN hyperparameters
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--epsilon', type=float, default=0.9)
    parser.add_argument('--epsilon_min', type=float, default=0.05)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--warmup_steps', type=int, default=5000)
    parser.add_argument('--buffer_size', type=int, default=int(1e5))
    parser.add_argument('--target_update_interval', type=int, default=10000)
    # training hyperparameters
    parser.add_argument('--max_steps', type=int, default=int(2.5e5))
    parser.add_argument('--eval_interval', type=int, default=10000)
    # others
    parser.add_argument('--save_root', type=Path, default='./submissions')
    parser.add_argument("--exp-name", type=str, default=os.path.splitext(os.path.basename(__file__))[0],
                        help="the name of this experiment")
    # evaluation
    parser.add_argument('--eval', action="store_true", help='evaluate the model')
    parser.add_argument('--eval_model_path', type=str, default=None, help='the path of the model to evaluate')
    return parser.parse_args()


def validation(agent, num_evals=5):
    eval_env = gym.make('ALE/MsPacman-v5')
    eval_env = ImageEnv(eval_env)

    scores = 0
    for i in range(num_evals):
        (state, _), done = eval_env.reset(), False
        while not done:
            "*** YOUR CODE HERE ***"
            utils.raiseNotDefined()
            # pick an action with your agent
            action = YOUR_CODE_HERE
            # get the feedback for that action from the environment
            next_state, reward, terminated, truncated, info = YOUR_CODE_HERE

            state = next_state
            scores += reward
            done = terminated or truncated
    return np.round(scores / num_evals, 4)


def train(agent, env):
    history = {'Step': [], 'AvgScore': []}

    (state, _) = env.reset()

    for _ in tqdm(range(args.max_steps)):
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        result = agent.process((state, action, reward, next_state, terminated))  # You can track the Q-loss over training from the `result` variable.

        state = next_state
        if terminated or truncated:
            state, _ = env.reset()

        if agent.total_steps % args.eval_interval == 0:
            avg_score = validation(agent)
            history['Step'].append(agent.total_steps)
            history['AvgScore'].append(avg_score)

            # log info to plot your figure
            "*** YOUR CODE HERE ***"

            # save model
            torch.save(agent.network.state_dict(), save_dir / 'pacma_dqn.pt')
            print("Step: {}, AvgScore: {}, ValueLoss: {}".format(agent.total_steps, avg_score, result["value_loss"]))


def evaluate(agent, eval_env, capture_frames=True):
    seed_everything(0, eval_env)  # don't modify

    # load the model
    if agent is None:
        action_dim = eval_env.action_space.n
        state_dim = (args.num_envs, args.image_hw, args.image_hw)
        agent = DQN(state_dim=state_dim, action_dim=action_dim)
        agent.network.load_state_dict(torch.load(args.eval_model_path))

    (state, _), done = eval_env.reset(), False

    scores = 0
    # Record the frames
    if capture_frames:
        writer = imageio.get_writer(save_dir / 'mspacman.mp4', fps=10)

    while not done:
        if capture_frames:
            writer.append_data(eval_env.render())
        else:
            eval_env.render()

        action = agent.act(state, training=False)
        next_state, reward, terminated, truncated, info = eval_env.step(action)
        state = next_state
        scores += reward
        done = terminated or truncated
    if capture_frames:
        writer.close()
    print("The score of the agent: ", scores)


def main():
    env = gym.make(args.env_name)
    env = ImageEnv(env, stack_frames=args.num_envs, image_hw=args.image_hw)

    action_dim = env.action_space.n
    state_dim = (args.num_envs, args.image_hw, args.image_hw)
    agent = DQN(state_dim=state_dim, action_dim=action_dim)

    # train
    train(agent, env)

    # evaluate
    eval_env = gym.make(args.env_name, render_mode='rgb_array')
    eval_env = ImageEnv(eval_env, stack_frames=args.num_envs, image_hw=args.image_hw)
    evaluate(agent, eval_env)


if __name__ == "__main__":
    args = parse_args()

    # save_dir = args.save_root / f"{args.env_name.replace('/', '-')}__{args.exp_name}__{int(time.time())}"
    save_dir = args.save_root
    if not save_dir.exists():
        save_dir.mkdir(parents=True)

    if args.eval:
        eval_env = gym.make(args.env_name, render_mode='rgb_array')
        eval_env = ImageEnv(eval_env, stack_frames=args.num_envs, image_hw=args.image_hw)
        evaluate(agent=None, eval_env=eval_env, capture_frames=False)
    else:
        main()
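As a point of reference for the blanks in validation() above, evaluate() in the same file already performs an analogous greedy rollout. A hedged, self-contained sketch of that pattern follows; the helper name rollout_score is illustrative, not part of the assignment, and this is one possibility rather than the intended solution:

def rollout_score(agent, eval_env):
    # Roll out one episode with the agent acting greedily, mirroring evaluate().
    (state, _), done = eval_env.reset(), False
    score = 0
    while not done:
        action = agent.act(state, training=False)  # pick an action with the agent
        next_state, reward, terminated, truncated, info = eval_env.step(action)  # environment feedback
        state = next_state
        score += reward
        done = terminated or truncated
    return score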
requirements.txt (new file, +14)
@@ -0,0 +1,14 @@
opencv-python==4.8.1.78
swig==4.2.1
gymnasium==0.29.1
gymnasium[atari, accept-rom-license]
numpy==1.26.4
matplotlib==3.8.4
imageio-ffmpeg
imageio==2.34.1
torch
tqdm

# # for CUDA 11.3 torch on Linux
# --index-url https://download.pytorch.org/whl/cu113; sys_platform == "linux"
# torch; sys_platform == "linux"
rl_algorithm.py (new file, +161)
@@ -0,0 +1,161 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from utils import YOUR_CODE_HERE
import utils


class PacmanActionCNN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PacmanActionCNN, self).__init__()
        # build your own CNN model
        "*** YOUR CODE HERE ***"
        utils.raiseNotDefined()
        # this is just an example, you can modify it.
        self.conv1 = nn.Conv2d(state_dim, 16, kernel_size=8, stride=4)

    def forward(self, x):
        x = F.relu(self.conv1(x))

        "*** YOUR CODE HERE ***"
        utils.raiseNotDefined()

        return x


class ReplayBuffer:
    # referenced [TD3 official implementation](https://github.com/sfujim/TD3/blob/master/utils.py#L5).
    def __init__(self, state_dim, action_dim, max_size=int(1e5)):
        self.states = np.zeros((max_size, *state_dim), dtype=np.float32)
        self.actions = np.zeros((max_size, *action_dim), dtype=np.int64)
        self.rewards = np.zeros((max_size, 1), dtype=np.float32)
        self.next_states = np.zeros((max_size, *state_dim), dtype=np.float32)
        self.terminated = np.zeros((max_size, 1), dtype=np.float32)

        self.ptr = 0
        self.size = 0
        self.max_size = max_size

    def update(self, state, action, reward, next_state, terminated):
        self.states[self.ptr] = state
        self.actions[self.ptr] = action
        self.rewards[self.ptr] = reward
        self.next_states[self.ptr] = next_state
        self.terminated[self.ptr] = terminated

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = np.random.randint(0, self.size, batch_size)
        return (
            torch.FloatTensor(self.states[ind]),
            torch.FloatTensor(self.actions[ind]),
            torch.FloatTensor(self.rewards[ind]),
            torch.FloatTensor(self.next_states[ind]),
            torch.FloatTensor(self.terminated[ind]),
        )


class DQN:
    def __init__(
        self,
        state_dim,
        action_dim,
        lr=1e-4,
        epsilon=0.9,
        epsilon_min=0.05,
        gamma=0.99,
        batch_size=64,
        warmup_steps=5000,
        buffer_size=int(1e5),
        target_update_interval=10000,
    ):
        """
        The DQN agent has four methods.

        - __init__() as usual.
        - act() takes a single state (np.ndarray) as input and outputs an action following an epsilon-greedy policy.
        - process() takes one transition as input and defines what the agent does at each step.
        - learn() samples a mini-batch from the replay buffer and trains the Q-network.
        """
        self.action_dim = action_dim
        self.epsilon = epsilon
        self.gamma = gamma
        self.batch_size = batch_size
        self.warmup_steps = warmup_steps
        self.target_update_interval = target_update_interval

        self.network = PacmanActionCNN(state_dim[0], action_dim)
        self.target_network = PacmanActionCNN(state_dim[0], action_dim)
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = torch.optim.RMSprop(self.network.parameters(), lr)

        self.buffer = ReplayBuffer(state_dim, (1, ), buffer_size)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.network.to(self.device)
        self.target_network.to(self.device)

        self.total_steps = 0
        self.epsilon_decay = (epsilon - epsilon_min) / 1e6

    @torch.no_grad()
    def act(self, x, training=True):
        self.network.train(training)
        if training and ((np.random.rand() < self.epsilon) or (self.total_steps < self.warmup_steps)):
            # Random action
            action = np.random.randint(0, self.action_dim)
        else:
            # Greedy action: pick the action with the highest Q-value
            x = torch.from_numpy(x).float().unsqueeze(0).to(self.device)

            "*** YOUR CODE HERE ***"
            utils.raiseNotDefined()
            # get q-values from the network
            q_value = YOUR_CODE_HERE
            # get the action with the maximum q-value
            action = YOUR_CODE_HERE

        return action

    def learn(self):
        "*** YOUR CODE HERE ***"
        utils.raiseNotDefined()

        # sample a mini-batch from the replay buffer
        state, action, reward, next_state, terminated = map(lambda x: x.to(self.device), self.buffer.sample(self.batch_size))

        # get q-values for the next states
        next_q = YOUR_CODE_HERE
        # td_target: if terminated, only reward; otherwise reward + gamma * max(next_q)
        td_target = YOUR_CODE_HERE
        # compute the loss from td_target and the q-values
        loss = YOUR_CODE_HERE

        # reset the optimizer's gradients
        "self.optimizer.YOUR_CODE_HERE"
        # backpropagation
        YOUR_CODE_HERE
        # update the network
        "self.optimizer.YOUR_CODE_HERE"

        return {YOUR_CODE_HERE}  # return a dictionary for logging

    def process(self, transition):
        "*** YOUR CODE HERE ***"
        utils.raiseNotDefined()

        result = {}
        self.total_steps += 1

        # update the replay buffer
        "self.buffer.YOUR_CODE_HERE"

        if self.total_steps > self.warmup_steps:
            result = self.learn()

        if self.total_steps % self.target_update_interval == 0:
            # update the target network
            "self.target_network.YOUR_CODE_HERE"

        self.epsilon -= self.epsilon_decay
        return result
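For orientation on the learn() blanks above, here is a hedged, self-contained sketch of the TD target and loss that the inline comments describe (reward alone when terminated, otherwise reward + gamma * max next-Q). The helper name dqn_td_loss and the choice of MSE loss are illustrative assumptions, not part of the assignment:

import torch
import torch.nn.functional as F


def dqn_td_loss(network, target_network, batch, gamma=0.99):
    # `batch` follows ReplayBuffer.sample(): states, actions, rewards, next_states, terminated.
    state, action, reward, next_state, terminated = batch
    with torch.no_grad():
        # max_a' Q_target(s', a'), kept as shape [batch_size, 1]
        next_q = target_network(next_state).max(dim=1, keepdim=True).values
        # bootstrap only for non-terminal transitions
        td_target = reward + (1.0 - terminated) * gamma * next_q
    # Q(s, a) for the actions that were actually taken
    q = network(state).gather(1, action.long())
    return F.mse_loss(q, td_target)  # Huber (smooth L1) loss is a common alternative

The surrounding optimizer calls (zero_grad / backward / step) and the periodic target-network sync in process() follow the standard PyTorch training pattern.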
submissions/.DS_Store (binary, vendored)
Binary file not shown.
utils.py (new file, +26)
@@ -0,0 +1,26 @@
import inspect
import sys
import os

import torch
import random
import numpy as np


def raiseNotDefined():
    filename = inspect.stack()[1][1]
    line = inspect.stack()[1][2]
    method = inspect.stack()[1][3]

    print(f"*** Method not implemented: {method} at line {line} of {filename} ***")
    sys.exit()


def seed_everything(seed, env):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    env.seed(seed)


YOUR_CODE_HERE = "*** YOUR CODE HERE ***"