feat: evaluate llm success

2024-05-05 23:02:06 +08:00 · 2024-05-05 23:02:06 +08:00 · 68330c5163
commit 68330c5163
parent abe343928b
3 changed files with 34 additions and 17 deletions
--- a/nav_src/agent.py
+++ b/nav_src/agent.py
@@ -49,6 +49,7 @@ FINAL_STOP_POINT = ""
 SUCCESS = 0
 TEMP_STEPS_COUNTER = 0
 STEPS_COUNTER = 0
 NOW_LOCATION = None
 MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
    "Invalid Format: Missing 'Action:' after 'Thought:"
@@ -71,16 +72,17 @@ class NavGPTOutputParser(AgentOutputParser):
        global STEPS_COUNTER
        global TEMP_STEPS_COUNTER
        global SUCCESS
-        # includes_answer = FINAL_ANSWER_ACTION in text
+        global NOW_LOCATION
        includes_answer = FINAL_ANSWER_ACTION in text
        regex = (
            r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
        )
        action_match = re.search(regex, text, re.DOTALL)
        if action_match:
-            # if includes_answer:
+            if includes_answer:
-            #     raise OutputParserException(
+                raise OutputParserException(
-            #         f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
+                    f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
-            #     )
+                )
            action = action_match.group(1).strip()
            action_input = action_match.group(2)
            tool_input = action_input.strip(" ")
@@ -92,13 +94,16 @@ class NavGPTOutputParser(AgentOutputParser):
            print("ACTION: ", action_input)
            print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}")
-            TEMP_STEPS_COUNTER += 1
+            # TEMP_STEPS_COUNTER += 1
            print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
            print(f"STEPS_COUNT = {STEPS_COUNTER}")
            print(f"SUCCESS = {SUCCESS}")
            NOW_LOCATION = tool_input
            print(f"NOW_LOCATION = {NOW_LOCATION}")
            '''
            if FINAL_STOP_POINT in text:
                STEPS_COUNTER += TEMP_STEPS_COUNTER
                SUCCESS += 1
@@ -110,14 +115,22 @@ class NavGPTOutputParser(AgentOutputParser):
                return AgentFinish(
                    {"output": action_input}, text
                )
            '''
            return AgentAction(action, tool_input, text)
        '''
        elif includes_answer:
            if NOW_LOCATION == FINAL_STOP_POINT:
                SUCCESS += 1
                print(f"SUCCESS = {SUCCESS}")
            else:
                print("NOT SUCCESS")
                print(f"{NOW_LOCATION}_{type(NOW_LOCATION)}")
                print(f"{FINAL_STOP_POINT}_{type(FINAL_STOP_POINT)}")
                print(f"SUCCESS = {SUCCESS}")
            return AgentFinish(
                {"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text
            )
        '''
        if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL):
            raise OutputParserException(
@@ -699,6 +712,7 @@ class NavGPTAgent(BaseAgent):
        global FINAL_STOP_POINT
        global TEMP_STEPS_COUNTER
        global NOW_LOCATION
        FINAL_STOP_POINT = obs[0]['stop']
@@ -725,6 +739,7 @@ class NavGPTAgent(BaseAgent):
        print(obs[0]['stop'])
        print(obs[0]['start'])
        print(obs[0]['target'])
        NOW_LOCATION = obs[0]['start']
        print("==")
@@ -816,4 +831,3 @@ class NavGPTAgent(BaseAgent):
                self.traj[i]['llm_observation'].append(observation)
        return self.traj
--- a/nav_src/env.py
+++ b/nav_src/env.py
@@ -162,14 +162,14 @@ class Simulator(object):
        self.navigable_dict = {}
        for start, v in navigable_dict.items():
            self.navigable_dict[start] = {}
-            print("BEFORE: ", len(navigable_dict[start]))
+            # print("BEFORE: ", len(navigable_dict[start]))
            for to, _v in navigable_dict[start].items():
                start_region = self.node_region[scan_ID][start]
                to_region = self.node_region[scan_ID][to]
                if start_region == to_region:
                    self.navigable_dict[start][to] = _v 
-                print(start_region, to_region)
+                # print(start_region, to_region)
-            print("AFTER: ", len(self.navigable_dict[start]))
+            # print("AFTER: ", len(self.navigable_dict[start]))
        # Get candidate
        self.getCandidate()
--- a/nav_src/prompt/planner_prompt.py
+++ b/nav_src/prompt/planner_prompt.py
@@ -248,11 +248,11 @@ VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an ind
 You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Obeservation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward.
-Explore the environment and don't stay at the original point. Keep Walking! Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.
+Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs. Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.
-If you find the object but I haven't said you can stop. You cannot say you have finished the task! Keep exploring the nearby area.
+At each step, determine if you've reached the destination.
-
+If yes, stop and output 'Final Answer: Finished!'.
-continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
+If not, continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
 Show your reasoning in the Thought section.
 Follow the given format and use provided tools.
@@ -266,8 +266,11 @@ Instruction: the instruction describing the whole trajectory
 Initial Observation: the initial observation of the environment
 Thought: you should always think about what to do next and why
 Action: the action to take, must be one of the tools [{tool_names}]
-Action Input: "Viewpoint ID" but do not stay in the original viewpoint
+Action Input: "Viewpoint ID"
 Observation: the result of the action
 ... (this Thought/Action/Action Input/Observation can repeat N times)
 Thought: I have reached the destination, I can stop.
 Final Answer: Finished!
 ----
 Begin!