From 89081b6b21a7232515b6b62dddb6f0e8a50980b0 Mon Sep 17 00:00:00 2001 From: Ting-Jun Wang Date: Mon, 29 Apr 2024 01:45:42 +0800 Subject: [PATCH] feat: complete NavGPT in a single room (without boundary) --- nav_src/NavGPT.py | 7 +- nav_src/agent.py | 166 +++++++++++++++++++++++++++++-- nav_src/env.py | 6 +- nav_src/parser.py | 2 +- nav_src/prompt/planner_prompt.py | 15 ++- 5 files changed, 174 insertions(+), 22 deletions(-) diff --git a/nav_src/NavGPT.py b/nav_src/NavGPT.py index e836632..2570df2 100644 --- a/nav_src/NavGPT.py +++ b/nav_src/NavGPT.py @@ -10,9 +10,10 @@ from parser import parse_args from env import REVERIENavBatch from agent import NavGPTAgent -def build_dataset(args): +def build_dataset(args, data_limit=100): feat_db = ImageObservationsDB(args.obs_dir, args.obs_summary_dir, args.obj_dir) + print(feat_db) dataset_class = REVERIENavBatch @@ -25,7 +26,7 @@ def build_dataset(args): ) val_env = dataset_class( feat_db, val_instr_data, args.connectivity_dir, args.navigable_dir, - batch_size=args.batch_size, seed=args.seed, name=split, + batch_size=args.batch_size, seed=args.seed, name=split, data_limit=data_limit ) # evaluation using all objects val_envs[split] = val_env @@ -95,7 +96,7 @@ def valid_from_file(args, val_envs): def main(): args = parse_args() - val_envs = build_dataset(args) + val_envs = build_dataset(args, data_limit=100) if args.valid_file is not None: valid_from_file(args, val_envs) diff --git a/nav_src/agent.py b/nav_src/agent.py index 63e068c..413b43d 100644 --- a/nav_src/agent.py +++ b/nav_src/agent.py @@ -45,6 +45,11 @@ FINAL_ANSWER_ACTION = "Final Answer:" EXCEPTION_TOOL_NAME = "_Exception" MAX_SCRATCHPAD_LENGTH = 7000 +FINAL_STOP_POINT = "" +SUCCESS = 0 +TEMP_STEPS_COUNTER = 0 +STEPS_COUNTER = 0 + MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = ( "Invalid Format: Missing 'Action:' after 'Thought:" ) @@ -63,16 +68,19 @@ class NavGPTOutputParser(AgentOutputParser): return FORMAT_INSTRUCTIONS def parse(self, text: str) -> Union[AgentAction, AgentFinish]: - includes_answer = FINAL_ANSWER_ACTION in text + global STEPS_COUNTER + global TEMP_STEPS_COUNTER + global SUCCESS + # includes_answer = FINAL_ANSWER_ACTION in text regex = ( r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?" ) action_match = re.search(regex, text, re.DOTALL) if action_match: - if includes_answer: - raise OutputParserException( - f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}" - ) + # if includes_answer: + # raise OutputParserException( + # f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}" + # ) action = action_match.group(1).strip() action_input = action_match.group(2) tool_input = action_input.strip(" ") @@ -80,12 +88,36 @@ class NavGPTOutputParser(AgentOutputParser): if tool_input.startswith("SELECT ") is False: tool_input = tool_input.strip('"') - return AgentAction(action, tool_input, text) + print("TEXT:", text) + print("ACTION: ", action_input) + print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}") + TEMP_STEPS_COUNTER += 1 + print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}") + print(f"STEPS_COUNT = {STEPS_COUNTER}") + print(f"SUCCESS = {SUCCESS}") + + + + if FINAL_STOP_POINT in text: + STEPS_COUNTER += TEMP_STEPS_COUNTER + SUCCESS += 1 + TEMP_STEPS_COUNTER = 0 + print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}") + print(f"STEPS_COUNT = {STEPS_COUNTER}") + print(f"SUCCESS = {SUCCESS}") + + return AgentFinish( + {"output": action_input}, text + ) + + return AgentAction(action, tool_input, text) + ''' elif includes_answer: return AgentFinish( {"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text ) + ''' if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL): raise OutputParserException( @@ -197,13 +229,14 @@ class NavGPTAgent(BaseAgent): self.output_parser = NavGPTOutputParser() self.agent_executor = self.create_vln_agent() - print("AGENT_EXECUTOR: ", type(self.agent_executor)) + ''' plan_prompt = PromptTemplate( template=PLANNER_PROMPT, input_variables=["instruction"], ) self.plan_chain = LLMChain(llm=self.llm, prompt=plan_prompt) + ''' def parse_action(self, llm_output: str) -> Tuple[str, str]: regex = r"(.*?)Final Answer:[\s]*(.*)" @@ -589,6 +622,8 @@ class NavGPTAgent(BaseAgent): # We will be here tools = [self.action_maker] print(tools) + print("TOOL NAME: ", ", ".join([tool.name for tool in tools])) + print("TOOL DESCRIPTION: ", [f"{tool.name}: {tool.description}" for tool in tools]) prompt = PromptTemplate( template=VLN_GPT4_PROMPT if self.config.llm_model_name == 'gpt-4' else VLN_GPT35_PROMPT, input_variables=["action_plan", "init_observation", "agent_scratchpad"], @@ -661,6 +696,16 @@ class NavGPTAgent(BaseAgent): obs = self.env.reset() else: obs = self.env._get_obs() + + global FINAL_STOP_POINT + global TEMP_STEPS_COUNTER + + FINAL_STOP_POINT = obs[0]['stop'] + + if TEMP_STEPS_COUNTER != 0: + TEMP_STEPS_COUNTER = 0 + + print(f"HAVE SET FINAL_STOP_POINT = {FINAL_STOP_POINT}") print(len(obs)) @@ -771,3 +816,110 @@ class NavGPTAgent(BaseAgent): self.traj[i]['llm_observation'].append(observation) return self.traj + +class RandomAgent(BaseAgent): + def __init__( + self, + env: REVERIENavBatch, + config: Namespace): + """ + Initialize the LLM Navigation Agent. + + Args: + env: The Matterport3D environment. + config: The configuration. + """ + super().__init__(env) + self.config = config + + + def init_trajecotry(self, obs: List[dict]): + """Initialize the trajectory with the given observation.""" + # Record the navigation path + self.traj = [{ + 'instr_id': ob['instr_id'], + 'path': [[ob['start']]], + 'details': [], + } for ob in obs] + # Record the history of actions taken + + + def make_equiv_action(self, actions: List[str]) -> str: + """ + Interface between Panoramic view and Egocentric view + Take in the next viewpoint ID and move the agent to that viewpoint + return the turned angle and new observation + """ + def normalize_angle(angle): + while angle > 180: + angle -= 360 + while angle <= -180: + angle += 360 + return angle + + def angle_to_left_right(angle): + return f"left {-angle:.2f}" if angle < 0 else f"right {angle:.2f}" + + # Get current agent facing angle + cur_obs = self.env._get_obs()[0] + cur_heading = np.rad2deg(cur_obs['heading']) + # Make the action + new_obs = self.env.step(actions)[0] + new_heading = np.rad2deg(new_obs['heading']) + # Record the trajectory + self.traj[0]['path'].append(self.env.env.sims[0].gmap.bfs_shortest_path(cur_obs['viewpoint'], actions[0])[1:]) + # Calculate the turned angle + turned_angle = new_heading - cur_heading + # Generate action description + cur_heading = angle_to_left_right(normalize_angle(cur_heading)) + new_heading = angle_to_left_right(normalize_angle(new_heading)) + action_description = f'Turn heading direction {turned_angle:.2f} degrees from {cur_heading} to {new_heading}.' + return action_description, new_obs + + def rollout(self, reset=True): + if reset: # Reset env + obs = self.env.reset() + else: + obs = self.env._get_obs() + + global FINAL_STOP_POINT + global TEMP_STEPS_COUNTER + + FINAL_STOP_POINT = obs[0]['stop'] + + if TEMP_STEPS_COUNTER != 0: + TEMP_STEPS_COUNTER = 0 + + print(obs[0].keys()) + print(obs[0]['obs']) + print(obs[0]['obs_summary']) + print(obs[0]['objects']) + print(obs[0]['instr_id']) + print(obs[0]['scan']) + print(obs[0]['viewpoint']) + print(obs[0]['heading']) + print(obs[0]['elevation']) + print(obs[0]['candidate']) + print(obs[0]['instruction']) + print(obs[0]['gt_path']) + print(obs[0]['path_id']) + print(obs[0]['stop']) + print(obs[0]['start']) + print(obs[0]['target']) + + + print("==") + + # Initialize the trajectory + self.init_trajecotry(obs) + + for i, init_ob in enumerate(obs): + + navigable = init_ob['candidate'] + heading = np.rad2deg(init_ob['heading']) + elevation = np.rad2deg(init_ob['elevation']) + orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}' + + output = self.agent_executor(input) + + return self.traj diff --git a/nav_src/env.py b/nav_src/env.py index e6d1966..a31a778 100644 --- a/nav_src/env.py +++ b/nav_src/env.py @@ -141,7 +141,7 @@ class REVERIENavBatch(object): def __init__( self, view_db, instr_data, connectivity_dir, navigable_dir, - batch_size=1, seed=0, name=None, + batch_size=1, seed=0, name=None, data_limit=100 ): self.env = EnvBatch(navigable_dir, feat_db=view_db, batch_size=batch_size) self.data = instr_data @@ -157,6 +157,8 @@ class REVERIENavBatch(object): random.seed(self.seed) random.shuffle(self.data) + self.data = self.data[:data_limit] + self.ix = 0 self._load_nav_graphs() @@ -263,7 +265,7 @@ class REVERIENavBatch(object): stops = [item['stop'] for item in self.batch] starts = [item['start'] for item in self.batch] targets = [item['target'] for item in self.batch] - self.env.newEpisodes(scanIds, viewpointIds, headings, stops, starts, targets) + self.env.newEpisodes(scanIds, starts, headings, stops, starts, targets) return self._get_obs() def step(self, next_viewpoint_IDs): diff --git a/nav_src/parser.py b/nav_src/parser.py index f4428c8..1f26714 100644 --- a/nav_src/parser.py +++ b/nav_src/parser.py @@ -18,7 +18,7 @@ def parse_args(): # parser.add_argument('--llm_model_name', type=str, default='gpt-4', help='llm model name') # parser.add_argument('--llm_model_name', type=str, default='LlaMA-2-13b', help='llm model name') parser.add_argument('--batch_size', type=int, default=1) - parser.add_argument('--max_iterations', type=int, default=10) + parser.add_argument('--max_iterations', type=int, default=25) # General config parser.add_argument('--iters', type=int, default=10, help='number of iterations to run') diff --git a/nav_src/prompt/planner_prompt.py b/nav_src/prompt/planner_prompt.py index 4ba0435..c014fbe 100644 --- a/nav_src/prompt/planner_prompt.py +++ b/nav_src/prompt/planner_prompt.py @@ -248,11 +248,11 @@ VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an ind You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Obeservation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward. -Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs. Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer. +Explore the environment and don't stay at the original point. Keep Walking! Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer. -At each step, determine if you've reached the destination. -If yes, stop and output 'Final Answer: Finished!'. -If not, continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool. +If you find the object but I haven't said you can stop. You cannot say you have finished the task! Keep exploring the nearby area. + +continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool. Show your reasoning in the Thought section. Follow the given format and use provided tools. @@ -266,15 +266,12 @@ Instruction: the instruction describing the whole trajectory Initial Observation: the initial observation of the environment Thought: you should always think about what to do next and why Action: the action to take, must be one of the tools [{tool_names}] -Action Input: "Viewpoint ID" +Action Input: "Viewpoint ID" but do not stay in the original viewpoint Observation: the result of the action -... (this Thought/Action/Action Input/Observation can repeat N times) -Thought: I have reached the destination, I can stop. -Final Answer: Finished! ---- Begin! Instruction: {action_plan} Initial Observation: {init_observation} -Thought: I should start navigation according to the instruction, {agent_scratchpad}""" \ No newline at end of file +Thought: I should start navigation according to the instruction, {agent_scratchpad}"""