Compare commits

...

2 Commits

Author SHA1 Message Date
82e2c7e053
feat: random choose the final stop node 2024-05-06 01:10:16 +08:00
68330c5163
feat: evaluate llm success 2024-05-05 23:02:06 +08:00
4 changed files with 59 additions and 123 deletions

View File

@ -1,4 +1,5 @@
"""Agent that interacts with Matterport3D simulator via a hierarchical planning approach.""" """Agent that interacts with Matterport3D simulator via a hierarchical planning approach."""
import random
import json import json
import yaml import yaml
import re import re
@ -49,6 +50,7 @@ FINAL_STOP_POINT = ""
SUCCESS = 0 SUCCESS = 0
TEMP_STEPS_COUNTER = 0 TEMP_STEPS_COUNTER = 0
STEPS_COUNTER = 0 STEPS_COUNTER = 0
NOW_LOCATION = None
MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = ( MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
"Invalid Format: Missing 'Action:' after 'Thought:" "Invalid Format: Missing 'Action:' after 'Thought:"
@ -71,16 +73,17 @@ class NavGPTOutputParser(AgentOutputParser):
global STEPS_COUNTER global STEPS_COUNTER
global TEMP_STEPS_COUNTER global TEMP_STEPS_COUNTER
global SUCCESS global SUCCESS
# includes_answer = FINAL_ANSWER_ACTION in text global NOW_LOCATION
includes_answer = FINAL_ANSWER_ACTION in text
regex = ( regex = (
r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?" r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
) )
action_match = re.search(regex, text, re.DOTALL) action_match = re.search(regex, text, re.DOTALL)
if action_match: if action_match:
# if includes_answer: if includes_answer:
# raise OutputParserException( raise OutputParserException(
# f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}" f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
# ) )
action = action_match.group(1).strip() action = action_match.group(1).strip()
action_input = action_match.group(2) action_input = action_match.group(2)
tool_input = action_input.strip(" ") tool_input = action_input.strip(" ")
@ -92,13 +95,16 @@ class NavGPTOutputParser(AgentOutputParser):
print("ACTION: ", action_input) print("ACTION: ", action_input)
print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}") print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}")
TEMP_STEPS_COUNTER += 1 # TEMP_STEPS_COUNTER += 1
print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}") print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
print(f"STEPS_COUNT = {STEPS_COUNTER}") print(f"STEPS_COUNT = {STEPS_COUNTER}")
print(f"SUCCESS = {SUCCESS}") print(f"SUCCESS = {SUCCESS}")
NOW_LOCATION = tool_input
print(f"NOW_LOCATION = {NOW_LOCATION}")
'''
if FINAL_STOP_POINT in text: if FINAL_STOP_POINT in text:
STEPS_COUNTER += TEMP_STEPS_COUNTER STEPS_COUNTER += TEMP_STEPS_COUNTER
SUCCESS += 1 SUCCESS += 1
@ -110,14 +116,22 @@ class NavGPTOutputParser(AgentOutputParser):
return AgentFinish( return AgentFinish(
{"output": action_input}, text {"output": action_input}, text
) )
'''
return AgentAction(action, tool_input, text) return AgentAction(action, tool_input, text)
'''
elif includes_answer: elif includes_answer:
if NOW_LOCATION == FINAL_STOP_POINT:
SUCCESS += 1
print(f"SUCCESS = {SUCCESS}")
else:
print("NOT SUCCESS")
print(f"{NOW_LOCATION}_{type(NOW_LOCATION)}")
print(f"{FINAL_STOP_POINT}_{type(FINAL_STOP_POINT)}")
print(f"SUCCESS = {SUCCESS}")
return AgentFinish( return AgentFinish(
{"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text {"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text
) )
'''
if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL): if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL):
raise OutputParserException( raise OutputParserException(
@ -698,122 +712,29 @@ class NavGPTAgent(BaseAgent):
obs = self.env._get_obs() obs = self.env._get_obs()
global FINAL_STOP_POINT global FINAL_STOP_POINT
global TEMP_STEPS_COUNTER global SUCCESS
FINAL_STOP_POINT = obs[0]['stop'] FINAL_STOP_POINT = obs[0]['stop']
if TEMP_STEPS_COUNTER != 0:
TEMP_STEPS_COUNTER = 0
print(f"HAVE SET FINAL_STOP_POINT = {FINAL_STOP_POINT}")
print(len(obs))
print(obs[0].keys())
print(obs[0]['obs'])
print(obs[0]['obs_summary'])
print(obs[0]['objects'])
print(obs[0]['instr_id'])
print(obs[0]['scan'])
print(obs[0]['viewpoint'])
print(obs[0]['heading'])
print(obs[0]['elevation'])
print(obs[0]['candidate'])
print(obs[0]['instruction'])
print(obs[0]['gt_path'])
print(obs[0]['path_id'])
print(obs[0]['stop'])
print(obs[0]['start'])
print(obs[0]['target'])
print("==")
# Initialize the trajectory # Initialize the trajectory
self.init_trajecotry(obs) self.init_trajecotry(obs)
# Load the instruction print(obs[0].keys())
# instructions = [ob['instruction'] for ob in obs] print(obs[0]['start'])
targets = [ob['target'] for ob in obs] print(obs[0]['stop'])
print(obs[0]['target'])
candidates = self.env.env.sims[0].getNodesInTheRoom()
candidates.remove(obs[0]['start'])
next_point = random.choice(candidates)
print(next_point)
print(self.config.load_instruction) if next_point == FINAL_STOP_POINT:
print(self.config.load_action_plan) print(" SUCCESS")
SUCCESS += 1
if self.config.load_instruction: print(f"SUCCESS={SUCCESS}")
# action_plans = instructions
action_plans = targets
elif self.config.load_action_plan:
action_plans = [ob['action_plan'] for ob in obs]
else:
action_plans = []
for instruction in instructions:
action_plan = self.plan_chain.run(instruction = instruction)
action_plans.append(action_plan)
print(action_plans)
for i, init_ob in enumerate(obs):
# for our work
# cur_action_plan is "target object with its location"
self.cur_action_plan = action_plans[i]
print("use_tool_chain:", self.config.use_tool_chain)
# Take the first action
if self.config.use_tool_chain: # we will not HERE
first_obs = self.action_maker('')
input = {
'action_plan': self.cur_action_plan,
'init_observation': init_ob['obs_summary'],
'observation': first_obs,
}
else:
# Get current feature
# we are HERE
feature = init_ob['obs']
navigable = init_ob['candidate']
objects = init_ob['objects']
heading = np.rad2deg(init_ob['heading'])
elevation = np.rad2deg(init_ob['elevation'])
orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
print("use_relative_angle:", self.config.use_relative_angle)
print("use_relative_angle:", self.config.use_navigable)
if self.config.use_relative_angle: # True
feature = self.modify_heading_angles(heading, feature, navigable, objects)
if self.config.use_navigable: # False
navigable = self.get_navigable_str(heading, elevation, navigable)
if self.config.use_relative_angle:
if self.config.use_navigable:
init_observation = f"\n\tCurrent Viewpoint:\n{feature}\n\tNavigable Viewpoints:\n{navigable}"
else:
init_observation = f"\n\tCurrent Viewpoint:\n{feature}"
else:
if self.config.use_navigable:
init_observation = f"\n\tCurrent Orientation:\n{orientation}\n\tCurrent Viewpoint:\n{feature}\n\tNavigable Viewpoints:\n{navigable}"
else:
init_observation = f"\n\tCurrent Orientation:\n{orientation}\n\tCurrent Viewpoint:\n{feature}"
input = {
'action_plan': self.cur_action_plan, # here will be "object & its location" in our work
'init_observation': init_observation, # 8 direction's observation caption & navigable point & objects
}
output = self.agent_executor(input)
self.traj[i]['llm_output'] = output['output']
self.traj[i]['action_plan'] = output['action_plan']
# extract agent's thought from llm output
intermediate_steps = output['intermediate_steps']
self.traj[i]['llm_thought'] = []
self.traj[i]['llm_observation'] = []
for action, observation in intermediate_steps:
thought = action.log
self.traj[i]['llm_thought'].append(thought)
self.traj[i]['llm_observation'].append(observation)
return self.traj return self.traj

View File

@ -38,15 +38,18 @@ class BaseAgent(object):
if iters is not None: if iters is not None:
# For each time, it will run the first 'iters' iterations. (It was shuffled before) # For each time, it will run the first 'iters' iterations. (It was shuffled before)
for i in range(iters): for i in range(iters):
print(i)
for traj in self.rollout(**kwargs): for traj in self.rollout(**kwargs):
self.loss = 0 self.loss = 0
self.results[traj['instr_id']] = traj self.results[traj['instr_id']] = traj
'''
preds_detail = self.get_results(detailed_output=True) preds_detail = self.get_results(detailed_output=True)
json.dump( json.dump(
preds_detail, preds_detail,
open(os.path.join(self.config.log_dir, 'runtime.json'), 'w'), open(os.path.join(self.config.log_dir, 'runtime.json'), 'w'),
sort_keys=True, indent=4, separators=(',', ': ') sort_keys=True, indent=4, separators=(',', ': ')
) )
'''
else: # Do a full round else: # Do a full round
while True: while True:
for traj in self.rollout(**kwargs): for traj in self.rollout(**kwargs):

View File

@ -162,17 +162,26 @@ class Simulator(object):
self.navigable_dict = {} self.navigable_dict = {}
for start, v in navigable_dict.items(): for start, v in navigable_dict.items():
self.navigable_dict[start] = {} self.navigable_dict[start] = {}
print("BEFORE: ", len(navigable_dict[start])) # print("BEFORE: ", len(navigable_dict[start]))
for to, _v in navigable_dict[start].items(): for to, _v in navigable_dict[start].items():
start_region = self.node_region[scan_ID][start] start_region = self.node_region[scan_ID][start]
to_region = self.node_region[scan_ID][to] to_region = self.node_region[scan_ID][to]
if start_region == to_region: if start_region == to_region:
self.navigable_dict[start][to] = _v self.navigable_dict[start][to] = _v
print(start_region, to_region) # print(start_region, to_region)
print("AFTER: ", len(self.navigable_dict[start])) # print("AFTER: ", len(self.navigable_dict[start]))
# Get candidate # Get candidate
self.getCandidate() self.getCandidate()
def getNodesInTheRoom(self):
    """Return the IDs of all nodes located in the same region (room) as the
    current viewpoint.

    The region of ``self.viewpoint_ID`` within ``self.scan_ID`` is looked up
    first; every node of the scan whose region matches it is included — note
    this includes the current viewpoint itself (callers that need to exclude
    it must remove it explicitly).

    Returns:
        list: node IDs sharing the current viewpoint's region, in the
        iteration order of ``self.node_region[self.scan_ID]``.
    """
    region_map = self.node_region[self.scan_ID]
    current_region = region_map[self.viewpoint_ID]
    return [node_id for node_id, region in region_map.items()
            if region == current_region]
def updateGraph(self): def updateGraph(self):
# build graph # build graph

View File

@ -248,11 +248,11 @@ VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an ind
You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Obeservation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward. You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Obeservation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward.
Explore the environment and don't stay at the original point. Keep Walking! Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer. Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs. Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.
If you find the object but I haven't said you can stop. You cannot say you have finished the task! Keep exploring the nearby area. At each step, determine if you've reached the destination.
If yes, stop and output 'Final Answer: Finished!'.
continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool. If not, continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
Show your reasoning in the Thought section. Show your reasoning in the Thought section.
Follow the given format and use provided tools. Follow the given format and use provided tools.
@ -266,8 +266,11 @@ Instruction: the instruction describing the whole trajectory
Initial Observation: the initial observation of the environment Initial Observation: the initial observation of the environment
Thought: you should always think about what to do next and why Thought: you should always think about what to do next and why
Action: the action to take, must be one of the tools [{tool_names}] Action: the action to take, must be one of the tools [{tool_names}]
Action Input: "Viewpoint ID" but do not stay in the original viewpoint Action Input: "Viewpoint ID"
Observation: the result of the action Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I have reached the destination, I can stop.
Final Answer: Finished!
---- ----
Begin! Begin!