Compare commits

...

2 Commits

Author SHA1 Message Date
82e2c7e053
feat: random choose the final stop node 2024-05-06 01:10:16 +08:00
68330c5163
feat: evaluate llm success 2024-05-05 23:02:06 +08:00
4 changed files with 59 additions and 123 deletions

View File

@ -1,4 +1,5 @@
"""Agent that interacts with Matterport3D simulator via a hierarchical planning approach.""" """Agent that interacts with Matterport3D simulator via a hierarchical planning approach."""
import random
import json import json
import yaml import yaml
import re import re
@ -49,6 +50,7 @@ FINAL_STOP_POINT = ""
SUCCESS = 0 SUCCESS = 0
TEMP_STEPS_COUNTER = 0 TEMP_STEPS_COUNTER = 0
STEPS_COUNTER = 0 STEPS_COUNTER = 0
NOW_LOCATION = None
MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = ( MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
"Invalid Format: Missing 'Action:' after 'Thought:" "Invalid Format: Missing 'Action:' after 'Thought:"
@ -71,16 +73,17 @@ class NavGPTOutputParser(AgentOutputParser):
global STEPS_COUNTER global STEPS_COUNTER
global TEMP_STEPS_COUNTER global TEMP_STEPS_COUNTER
global SUCCESS global SUCCESS
# includes_answer = FINAL_ANSWER_ACTION in text global NOW_LOCATION
includes_answer = FINAL_ANSWER_ACTION in text
regex = ( regex = (
r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?" r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
) )
action_match = re.search(regex, text, re.DOTALL) action_match = re.search(regex, text, re.DOTALL)
if action_match: if action_match:
# if includes_answer: if includes_answer:
# raise OutputParserException( raise OutputParserException(
# f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}" f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
# ) )
action = action_match.group(1).strip() action = action_match.group(1).strip()
action_input = action_match.group(2) action_input = action_match.group(2)
tool_input = action_input.strip(" ") tool_input = action_input.strip(" ")
@ -92,13 +95,16 @@ class NavGPTOutputParser(AgentOutputParser):
print("ACTION: ", action_input) print("ACTION: ", action_input)
print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}") print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}")
TEMP_STEPS_COUNTER += 1 # TEMP_STEPS_COUNTER += 1
print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}") print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
print(f"STEPS_COUNT = {STEPS_COUNTER}") print(f"STEPS_COUNT = {STEPS_COUNTER}")
print(f"SUCCESS = {SUCCESS}") print(f"SUCCESS = {SUCCESS}")
NOW_LOCATION = tool_input
print(f"NOW_LOCATION = {NOW_LOCATION}")
'''
if FINAL_STOP_POINT in text: if FINAL_STOP_POINT in text:
STEPS_COUNTER += TEMP_STEPS_COUNTER STEPS_COUNTER += TEMP_STEPS_COUNTER
SUCCESS += 1 SUCCESS += 1
@ -110,14 +116,22 @@ class NavGPTOutputParser(AgentOutputParser):
return AgentFinish( return AgentFinish(
{"output": action_input}, text {"output": action_input}, text
) )
'''
return AgentAction(action, tool_input, text) return AgentAction(action, tool_input, text)
'''
elif includes_answer: elif includes_answer:
if NOW_LOCATION == FINAL_STOP_POINT:
SUCCESS += 1
print(f"SUCCESS = {SUCCESS}")
else:
print("NOT SUCCESS")
print(f"{NOW_LOCATION}_{type(NOW_LOCATION)}")
print(f"{FINAL_STOP_POINT}_{type(FINAL_STOP_POINT)}")
print(f"SUCCESS = {SUCCESS}")
return AgentFinish( return AgentFinish(
{"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text {"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text
) )
'''
if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL): if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL):
raise OutputParserException( raise OutputParserException(
@ -698,122 +712,29 @@ class NavGPTAgent(BaseAgent):
obs = self.env._get_obs() obs = self.env._get_obs()
global FINAL_STOP_POINT global FINAL_STOP_POINT
global TEMP_STEPS_COUNTER global SUCCESS
FINAL_STOP_POINT = obs[0]['stop'] FINAL_STOP_POINT = obs[0]['stop']
if TEMP_STEPS_COUNTER != 0:
TEMP_STEPS_COUNTER = 0
print(f"HAVE SET FINAL_STOP_POINT = {FINAL_STOP_POINT}")
print(len(obs))
print(obs[0].keys())
print(obs[0]['obs'])
print(obs[0]['obs_summary'])
print(obs[0]['objects'])
print(obs[0]['instr_id'])
print(obs[0]['scan'])
print(obs[0]['viewpoint'])
print(obs[0]['heading'])
print(obs[0]['elevation'])
print(obs[0]['candidate'])
print(obs[0]['instruction'])
print(obs[0]['gt_path'])
print(obs[0]['path_id'])
print(obs[0]['stop'])
print(obs[0]['start'])
print(obs[0]['target'])
print("==")
# Initialize the trajectory # Initialize the trajectory
self.init_trajecotry(obs) self.init_trajecotry(obs)
# Load the instruction print(obs[0].keys())
# instructions = [ob['instruction'] for ob in obs] print(obs[0]['start'])
targets = [ob['target'] for ob in obs] print(obs[0]['stop'])
print(obs[0]['target'])
candidates = self.env.env.sims[0].getNodesInTheRoom()
candidates.remove(obs[0]['start'])
next_point = random.choice(candidates)
print(next_point)
print(self.config.load_instruction) if next_point == FINAL_STOP_POINT:
print(self.config.load_action_plan) print(" SUCCESS")
SUCCESS += 1
if self.config.load_instruction: print(f"SUCCESS={SUCCESS}")
# action_plans = instructions
action_plans = targets
elif self.config.load_action_plan:
action_plans = [ob['action_plan'] for ob in obs]
else:
action_plans = []
for instruction in instructions:
action_plan = self.plan_chain.run(instruction = instruction)
action_plans.append(action_plan)
print(action_plans)
for i, init_ob in enumerate(obs):
# for our work
# cur_action_plan is "target object with its location"
self.cur_action_plan = action_plans[i]
print("use_tool_chain:", self.config.use_tool_chain)
# Take the first action
if self.config.use_tool_chain: # we will not HERE
first_obs = self.action_maker('')
input = {
'action_plan': self.cur_action_plan,
'init_observation': init_ob['obs_summary'],
'observation': first_obs,
}
else:
# Get current feature
# we are HERE
feature = init_ob['obs']
navigable = init_ob['candidate']
objects = init_ob['objects']
heading = np.rad2deg(init_ob['heading'])
elevation = np.rad2deg(init_ob['elevation'])
orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
print("use_relative_angle:", self.config.use_relative_angle)
print("use_relative_angle:", self.config.use_navigable)
if self.config.use_relative_angle: # True
feature = self.modify_heading_angles(heading, feature, navigable, objects)
if self.config.use_navigable: # False
navigable = self.get_navigable_str(heading, elevation, navigable)
if self.config.use_relative_angle:
if self.config.use_navigable:
init_observation = f"\n\tCurrent Viewpoint:\n{feature}\n\tNavigable Viewpoints:\n{navigable}"
else:
init_observation = f"\n\tCurrent Viewpoint:\n{feature}"
else:
if self.config.use_navigable:
init_observation = f"\n\tCurrent Orientation:\n{orientation}\n\tCurrent Viewpoint:\n{feature}\n\tNavigable Viewpoints:\n{navigable}"
else:
init_observation = f"\n\tCurrent Orientation:\n{orientation}\n\tCurrent Viewpoint:\n{feature}"
input = {
'action_plan': self.cur_action_plan, # here will be "object & its location" in our work
'init_observation': init_observation, # 8 direction's observation caption & navigable point & objects
}
output = self.agent_executor(input)
self.traj[i]['llm_output'] = output['output']
self.traj[i]['action_plan'] = output['action_plan']
# extract agent's thought from llm output
intermediate_steps = output['intermediate_steps']
self.traj[i]['llm_thought'] = []
self.traj[i]['llm_observation'] = []
for action, observation in intermediate_steps:
thought = action.log
self.traj[i]['llm_thought'].append(thought)
self.traj[i]['llm_observation'].append(observation)
return self.traj return self.traj

View File

@ -38,15 +38,18 @@ class BaseAgent(object):
if iters is not None: if iters is not None:
# For each time, it will run the first 'iters' iterations. (It was shuffled before) # For each time, it will run the first 'iters' iterations. (It was shuffled before)
for i in range(iters): for i in range(iters):
print(i)
for traj in self.rollout(**kwargs): for traj in self.rollout(**kwargs):
self.loss = 0 self.loss = 0
self.results[traj['instr_id']] = traj self.results[traj['instr_id']] = traj
'''
preds_detail = self.get_results(detailed_output=True) preds_detail = self.get_results(detailed_output=True)
json.dump( json.dump(
preds_detail, preds_detail,
open(os.path.join(self.config.log_dir, 'runtime.json'), 'w'), open(os.path.join(self.config.log_dir, 'runtime.json'), 'w'),
sort_keys=True, indent=4, separators=(',', ': ') sort_keys=True, indent=4, separators=(',', ': ')
) )
'''
else: # Do a full round else: # Do a full round
while True: while True:
for traj in self.rollout(**kwargs): for traj in self.rollout(**kwargs):

View File

@ -162,17 +162,26 @@ class Simulator(object):
self.navigable_dict = {} self.navigable_dict = {}
for start, v in navigable_dict.items(): for start, v in navigable_dict.items():
self.navigable_dict[start] = {} self.navigable_dict[start] = {}
print("BEFORE: ", len(navigable_dict[start])) # print("BEFORE: ", len(navigable_dict[start]))
for to, _v in navigable_dict[start].items(): for to, _v in navigable_dict[start].items():
start_region = self.node_region[scan_ID][start] start_region = self.node_region[scan_ID][start]
to_region = self.node_region[scan_ID][to] to_region = self.node_region[scan_ID][to]
if start_region == to_region: if start_region == to_region:
self.navigable_dict[start][to] = _v self.navigable_dict[start][to] = _v
print(start_region, to_region) # print(start_region, to_region)
print("AFTER: ", len(self.navigable_dict[start])) # print("AFTER: ", len(self.navigable_dict[start]))
# Get candidate # Get candidate
self.getCandidate() self.getCandidate()
def getNodesInTheRoom(self):
    """Return the IDs of all nodes located in the same region (room) as the
    current viewpoint.

    The region of ``self.viewpoint_ID`` within ``self.scan_ID`` is looked up
    first; every node of the scan whose region matches it is included — note
    this includes the current viewpoint itself (callers that need to exclude
    it must remove it explicitly).

    Returns:
        list: node IDs sharing the current viewpoint's region, in the
        iteration order of ``self.node_region[self.scan_ID]``.
    """
    region_map = self.node_region[self.scan_ID]
    current_region = region_map[self.viewpoint_ID]
    return [node_id for node_id, region in region_map.items()
            if region == current_region]
def updateGraph(self): def updateGraph(self):
# build graph # build graph

View File

@ -248,11 +248,11 @@ VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an ind
You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Obeservation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward. You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Obeservation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward.
Explore the environment and don't stay at the original point. Keep Walking! Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer. Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs. Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.
If you find the object but I haven't said you can stop. You cannot say you have finished the task! Keep exploring the nearby area. At each step, determine if you've reached the destination.
If yes, stop and output 'Final Answer: Finished!'.
continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool. If not, continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
Show your reasoning in the Thought section. Show your reasoning in the Thought section.
Follow the given format and use provided tools. Follow the given format and use provided tools.
@ -266,8 +266,11 @@ Instruction: the instruction describing the whole trajectory
Initial Observation: the initial observation of the environment Initial Observation: the initial observation of the environment
Thought: you should always think about what to do next and why Thought: you should always think about what to do next and why
Action: the action to take, must be one of the tools [{tool_names}] Action: the action to take, must be one of the tools [{tool_names}]
Action Input: "Viewpoint ID" but do not stay in the original viewpoint Action Input: "Viewpoint ID"
Observation: the result of the action Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I have reached the destination, I can stop.
Final Answer: Finished!
---- ----
Begin! Begin!