feat: complete NavGPT in a single room (without boundary)

Ting-Jun Wang 2024-04-29 01:45:42 +08:00
parent fc0ccb1458
commit 89081b6b21
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354
5 changed files with 174 additions and 22 deletions

View File

@@ -10,9 +10,10 @@ from parser import parse_args
from env import REVERIENavBatch
from agent import NavGPTAgent
def build_dataset(args):
def build_dataset(args, data_limit=100):
feat_db = ImageObservationsDB(args.obs_dir, args.obs_summary_dir, args.obj_dir)
print(feat_db)
dataset_class = REVERIENavBatch
@@ -25,7 +26,7 @@ def build_dataset(args):
)
val_env = dataset_class(
feat_db, val_instr_data, args.connectivity_dir, args.navigable_dir,
batch_size=args.batch_size, seed=args.seed, name=split,
batch_size=args.batch_size, seed=args.seed, name=split, data_limit=data_limit
) # evaluation using all objects
val_envs[split] = val_env
@@ -95,7 +96,7 @@ def valid_from_file(args, val_envs):
def main():
args = parse_args()
val_envs = build_dataset(args)
val_envs = build_dataset(args, data_limit=100)
if args.valid_file is not None:
valid_from_file(args, val_envs)
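
For reference, a minimal standalone sketch of the subsampling that data_limit now triggers: build_dataset passes data_limit into each REVERIENavBatch, which shuffles the split with the configured seed and keeps only the first data_limit episodes. The helper name below is illustrative, not part of the commit.

import random

def cap_split(instr_data, seed=0, data_limit=100):
    # Deterministic subsample, mirroring the REVERIENavBatch change in this commit:
    # seed the RNG, shuffle the episodes, keep the first `data_limit` of them.
    data = list(instr_data)
    random.seed(seed)
    random.shuffle(data)
    return data[:data_limit]

# e.g. cap_split(val_instr_data, seed=args.seed) keeps at most 100 validation episodes per split.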

View File

@@ -45,6 +45,11 @@ FINAL_ANSWER_ACTION = "Final Answer:"
EXCEPTION_TOOL_NAME = "_Exception"
MAX_SCRATCHPAD_LENGTH = 7000
FINAL_STOP_POINT = ""
SUCCESS = 0
TEMP_STEPS_COUNTER = 0
STEPS_COUNTER = 0
MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
"Invalid Format: Missing 'Action:' after 'Thought:"
)
@@ -63,16 +68,19 @@ class NavGPTOutputParser(AgentOutputParser):
return FORMAT_INSTRUCTIONS
def parse(self, text: str) -> Union[AgentAction, AgentFinish]:
includes_answer = FINAL_ANSWER_ACTION in text
global STEPS_COUNTER
global TEMP_STEPS_COUNTER
global SUCCESS
# includes_answer = FINAL_ANSWER_ACTION in text
regex = (
r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
)
action_match = re.search(regex, text, re.DOTALL)
if action_match:
if includes_answer:
raise OutputParserException(
f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
)
# if includes_answer:
# raise OutputParserException(
# f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
# )
action = action_match.group(1).strip()
action_input = action_match.group(2)
tool_input = action_input.strip(" ")
@@ -80,12 +88,36 @@
if tool_input.startswith("SELECT ") is False:
tool_input = tool_input.strip('"')
return AgentAction(action, tool_input, text)
print("TEXT:", text)
print("ACTION: ", action_input)
print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}")
TEMP_STEPS_COUNTER += 1
print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
print(f"STEPS_COUNT = {STEPS_COUNTER}")
print(f"SUCCESS = {SUCCESS}")
if FINAL_STOP_POINT in text:
STEPS_COUNTER += TEMP_STEPS_COUNTER
SUCCESS += 1
TEMP_STEPS_COUNTER = 0
print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
print(f"STEPS_COUNT = {STEPS_COUNTER}")
print(f"SUCCESS = {SUCCESS}")
return AgentFinish(
{"output": action_input}, text
)
return AgentAction(action, tool_input, text)
'''
elif includes_answer:
return AgentFinish(
{"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text
)
'''
if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL):
raise OutputParserException(
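
As a rough standalone illustration (not part of the commit) of what the tightened regex accepts: the Action Input must now be a 32-character hex string, i.e. a Matterport3D-style viewpoint ID. The sample response and ID below are made up.

import re

ACTION_RE = (
    r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
)
sample = (
    "Thought: the bedroom should be straight ahead, so I move to the next viewpoint.\n"
    "Action: action_maker\n"
    'Action Input: "0f7a2b9c4d6e8f0a1b2c3d4e5f6a7b8c"\n'
)
match = re.search(ACTION_RE, sample, re.DOTALL)
print(match.group(1), match.group(2))  # -> action_maker 0f7a2b9c4d6e8f0a1b2c3d4e5f6a7b8c
# In parse(), when FINAL_STOP_POINT appears in the response text, the agent returns AgentFinish
# with the matched viewpoint as output and updates SUCCESS / STEPS_COUNTER;
# otherwise it returns an AgentAction as before.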
@@ -197,13 +229,14 @@ class NavGPTAgent(BaseAgent):
self.output_parser = NavGPTOutputParser()
self.agent_executor = self.create_vln_agent()
print("AGENT_EXECUTOR: ", type(self.agent_executor))
'''
plan_prompt = PromptTemplate(
template=PLANNER_PROMPT,
input_variables=["instruction"],
)
self.plan_chain = LLMChain(llm=self.llm, prompt=plan_prompt)
'''
def parse_action(self, llm_output: str) -> Tuple[str, str]:
regex = r"(.*?)Final Answer:[\s]*(.*)"
@@ -589,6 +622,8 @@ class NavGPTAgent(BaseAgent):
# We will be here
tools = [self.action_maker]
print(tools)
print("TOOL NAME: ", ", ".join([tool.name for tool in tools]))
print("TOOL DESCRIPTION: ", [f"{tool.name}: {tool.description}" for tool in tools])
prompt = PromptTemplate(
template=VLN_GPT4_PROMPT if self.config.llm_model_name == 'gpt-4' else VLN_GPT35_PROMPT,
input_variables=["action_plan", "init_observation", "agent_scratchpad"],
@@ -662,6 +697,16 @@ class NavGPTAgent(BaseAgent):
else:
obs = self.env._get_obs()
global FINAL_STOP_POINT
global TEMP_STEPS_COUNTER
FINAL_STOP_POINT = obs[0]['stop']
if TEMP_STEPS_COUNTER != 0:
TEMP_STEPS_COUNTER = 0
print(f"HAVE SET FINAL_STOP_POINT = {FINAL_STOP_POINT}")
print(len(obs))
print(obs[0].keys())
@@ -771,3 +816,110 @@ class NavGPTAgent(BaseAgent):
self.traj[i]['llm_observation'].append(observation)
return self.traj
class RandomAgent(BaseAgent):
def __init__(
self,
env: REVERIENavBatch,
config: Namespace):
"""
Initialize the LLM Navigation Agent.
Args:
env: The Matterport3D environment.
config: The configuration.
"""
super().__init__(env)
self.config = config
def init_trajecotry(self, obs: List[dict]):
"""Initialize the trajectory with the given observation."""
# Record the navigation path
self.traj = [{
'instr_id': ob['instr_id'],
'path': [[ob['start']]],
'details': [],
} for ob in obs]
# Record the history of actions taken
def make_equiv_action(self, actions: List[str]) -> str:
"""
Interface between Panoramic view and Egocentric view
Take in the next viewpoint ID and move the agent to that viewpoint
return the turned angle and new observation
"""
def normalize_angle(angle):
while angle > 180:
angle -= 360
while angle <= -180:
angle += 360
return angle
def angle_to_left_right(angle):
return f"left {-angle:.2f}" if angle < 0 else f"right {angle:.2f}"
# Get current agent facing angle
cur_obs = self.env._get_obs()[0]
cur_heading = np.rad2deg(cur_obs['heading'])
# Make the action
new_obs = self.env.step(actions)[0]
new_heading = np.rad2deg(new_obs['heading'])
# Record the trajectory
self.traj[0]['path'].append(self.env.env.sims[0].gmap.bfs_shortest_path(cur_obs['viewpoint'], actions[0])[1:])
# Calculate the turned angle
turned_angle = new_heading - cur_heading
# Generate action description
cur_heading = angle_to_left_right(normalize_angle(cur_heading))
new_heading = angle_to_left_right(normalize_angle(new_heading))
action_description = f'Turn heading direction {turned_angle:.2f} degrees from {cur_heading} to {new_heading}.'
return action_description, new_obs
def rollout(self, reset=True):
if reset: # Reset env
obs = self.env.reset()
else:
obs = self.env._get_obs()
global FINAL_STOP_POINT
global TEMP_STEPS_COUNTER
FINAL_STOP_POINT = obs[0]['stop']
if TEMP_STEPS_COUNTER != 0:
TEMP_STEPS_COUNTER = 0
print(obs[0].keys())
print(obs[0]['obs'])
print(obs[0]['obs_summary'])
print(obs[0]['objects'])
print(obs[0]['instr_id'])
print(obs[0]['scan'])
print(obs[0]['viewpoint'])
print(obs[0]['heading'])
print(obs[0]['elevation'])
print(obs[0]['candidate'])
print(obs[0]['instruction'])
print(obs[0]['gt_path'])
print(obs[0]['path_id'])
print(obs[0]['stop'])
print(obs[0]['start'])
print(obs[0]['target'])
print("==")
# Initialize the trajectory
self.init_trajecotry(obs)
for i, init_ob in enumerate(obs):
navigable = init_ob['candidate']
heading = np.rad2deg(init_ob['heading'])
elevation = np.rad2deg(init_ob['elevation'])
orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
output = self.agent_executor(input)
return self.traj
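
The rollout above still ends with self.agent_executor(input), which RandomAgent never constructs; below is a minimal sketch of what a purely random policy could look like instead, assuming init_ob['candidate'] iterates over navigable viewpoint IDs and that make_equiv_action accepts a one-element list as in the diff. The helper is illustrative only.

import random

def random_rollout(agent, init_ob, max_steps=25):
    # Pick a random navigable viewpoint each step until the episode's stop viewpoint is reached.
    for _ in range(max_steps):
        next_vp = random.choice(list(init_ob['candidate']))  # assumes candidates are viewpoint IDs
        _, init_ob = agent.make_equiv_action([next_vp])
        if init_ob['viewpoint'] == init_ob['stop']:
            break
    return agent.traj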

View File

@@ -141,7 +141,7 @@ class REVERIENavBatch(object):
def __init__(
self, view_db, instr_data, connectivity_dir, navigable_dir,
batch_size=1, seed=0, name=None,
batch_size=1, seed=0, name=None, data_limit=100
):
self.env = EnvBatch(navigable_dir, feat_db=view_db, batch_size=batch_size)
self.data = instr_data
@@ -157,6 +157,8 @@ class REVERIENavBatch(object):
random.seed(self.seed)
random.shuffle(self.data)
self.data = self.data[:data_limit]
self.ix = 0
self._load_nav_graphs()
@@ -263,7 +265,7 @@ class REVERIENavBatch(object):
stops = [item['stop'] for item in self.batch]
starts = [item['start'] for item in self.batch]
targets = [item['target'] for item in self.batch]
self.env.newEpisodes(scanIds, viewpointIds, headings, stops, starts, targets)
self.env.newEpisodes(scanIds, starts, headings, stops, starts, targets)
return self._get_obs()
def step(self, next_viewpoint_IDs):

View File

@@ -18,7 +18,7 @@ def parse_args():
# parser.add_argument('--llm_model_name', type=str, default='gpt-4', help='llm model name')
# parser.add_argument('--llm_model_name', type=str, default='LlaMA-2-13b', help='llm model name')
parser.add_argument('--batch_size', type=int, default=1)
parser.add_argument('--max_iterations', type=int, default=10)
parser.add_argument('--max_iterations', type=int, default=25)
# General config
parser.add_argument('--iters', type=int, default=10, help='number of iterations to run')

View File

@@ -248,11 +248,11 @@ VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an ind
You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Observation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward.
Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs. Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.
Explore the environment and don't stay at the original point. Keep Walking! Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.
At each step, determine if you've reached the destination.
If yes, stop and output 'Final Answer: Finished!'.
If not, continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
If you find the object but have not yet been told you can stop, do not say you have finished the task; keep exploring the nearby area.
Continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
Show your reasoning in the Thought section.
Follow the given format and use provided tools.
@@ -266,11 +266,8 @@ Instruction: the instruction describing the whole trajectory
Initial Observation: the initial observation of the environment
Thought: you should always think about what to do next and why
Action: the action to take, must be one of the tools [{tool_names}]
Action Input: "Viewpoint ID"
Action Input: "Viewpoint ID" but do not stay in the original viewpoint
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I have reached the destination, I can stop.
Final Answer: Finished!
----
Begin!
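
For illustration (values made up), one step in the format the revised prompt asks for, and which the parser above expects, looks like:

Thought: The instruction says to head to the bathroom; the nearest navigable viewpoint looks right, so I will move there.
Action: action_maker
Action Input: "0f7a2b9c4d6e8f0a1b2c3d4e5f6a7b8c"
Observation: the scene description, objects, and navigable viewpoints returned by action_maker

Per the parser change above, the episode now ends once the stop viewpoint ID appears in the model's response, rather than on a 'Final Answer:' line.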