feat: complete NavGPT in a single room (without boundary)

2024-04-29 01:45:42 +08:00 · 2024-04-29 01:45:42 +08:00 · 89081b6b21
commit 89081b6b21
parent fc0ccb1458
5 changed files with 174 additions and 22 deletions
--- a/nav_src/NavGPT.py
+++ b/nav_src/NavGPT.py
@ -10,9 +10,10 @@ from parser import parse_args
 from env import REVERIENavBatch
 from agent import NavGPTAgent

-def build_dataset(args):
+def build_dataset(args, data_limit=100):

    feat_db = ImageObservationsDB(args.obs_dir, args.obs_summary_dir, args.obj_dir)
+    print(feat_db)

    dataset_class = REVERIENavBatch

@ -25,7 +26,7 @@ def build_dataset(args):
        )
        val_env = dataset_class(
            feat_db, val_instr_data, args.connectivity_dir, args.navigable_dir,
-            batch_size=args.batch_size, seed=args.seed, name=split,
+            batch_size=args.batch_size, seed=args.seed, name=split, data_limit=data_limit
        )   # evaluation using all objects
        val_envs[split] = val_env

@ -95,7 +96,7 @@ def valid_from_file(args, val_envs):
 def main():
    args = parse_args()

-    val_envs = build_dataset(args)
+    val_envs = build_dataset(args, data_limit=100)

    if args.valid_file is not None:
        valid_from_file(args, val_envs)
--- a/nav_src/agent.py
+++ b/nav_src/agent.py
@ -45,6 +45,11 @@ FINAL_ANSWER_ACTION = "Final Answer:"
 EXCEPTION_TOOL_NAME = "_Exception"
 MAX_SCRATCHPAD_LENGTH = 7000

+FINAL_STOP_POINT = ""
+SUCCESS = 0
+TEMP_STEPS_COUNTER = 0
+STEPS_COUNTER = 0
+
 MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
    "Invalid Format: Missing 'Action:' after 'Thought:"
 )
@ -63,16 +68,19 @@ class NavGPTOutputParser(AgentOutputParser):
        return FORMAT_INSTRUCTIONS

    def parse(self, text: str) -> Union[AgentAction, AgentFinish]:
-        includes_answer = FINAL_ANSWER_ACTION in text
+        global STEPS_COUNTER
+        global TEMP_STEPS_COUNTER
+        global SUCCESS
+        # includes_answer = FINAL_ANSWER_ACTION in text
        regex = (
            r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
        )
        action_match = re.search(regex, text, re.DOTALL)
        if action_match:
-            if includes_answer:
-                raise OutputParserException(
-                    f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
-                )
+            # if includes_answer:
+            #     raise OutputParserException(
+            #         f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
+            #     )
            action = action_match.group(1).strip()
            action_input = action_match.group(2)
            tool_input = action_input.strip(" ")
@ -80,12 +88,36 @@ class NavGPTOutputParser(AgentOutputParser):
            if tool_input.startswith("SELECT ") is False:
                tool_input = tool_input.strip('"')

-            return AgentAction(action, tool_input, text)
+            print("TEXT:", text)
+            print("ACTION: ", action_input)
+            print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}")

+            TEMP_STEPS_COUNTER += 1
+            print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
+            print(f"STEPS_COUNT = {STEPS_COUNTER}")
+            print(f"SUCCESS = {SUCCESS}")
+
+
+
+            if FINAL_STOP_POINT in text:
+                STEPS_COUNTER += TEMP_STEPS_COUNTER
+                SUCCESS += 1
+                TEMP_STEPS_COUNTER = 0
+                print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
+                print(f"STEPS_COUNT = {STEPS_COUNTER}")
+                print(f"SUCCESS = {SUCCESS}")
+
+                return AgentFinish(
+                    {"output": action_input}, text
+                )
+
+            return AgentAction(action, tool_input, text)
+        '''
        elif includes_answer:
            return AgentFinish(
                {"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text
            )
+        '''

        if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL):
            raise OutputParserException(
@ -197,13 +229,14 @@ class NavGPTAgent(BaseAgent):

        self.output_parser = NavGPTOutputParser()
        self.agent_executor = self.create_vln_agent()
-        print("AGENT_EXECUTOR: ", type(self.agent_executor))

+        '''
        plan_prompt = PromptTemplate(
            template=PLANNER_PROMPT,
            input_variables=["instruction"],
        )
        self.plan_chain = LLMChain(llm=self.llm, prompt=plan_prompt)
+        '''
    
    def parse_action(self, llm_output: str) -> Tuple[str, str]:
        regex = r"(.*?)Final Answer:[\s]*(.*)"
@ -589,6 +622,8 @@ class NavGPTAgent(BaseAgent):
            # We will be here
            tools = [self.action_maker]
            print(tools)
+            print("TOOL NAME: ", ", ".join([tool.name for tool in tools]))
+            print("TOOL DESCRIPTION: ", [f"{tool.name}: {tool.description}" for tool in tools])
            prompt = PromptTemplate(
                template=VLN_GPT4_PROMPT if self.config.llm_model_name == 'gpt-4' else VLN_GPT35_PROMPT,
                input_variables=["action_plan", "init_observation", "agent_scratchpad"],
@ -661,6 +696,16 @@ class NavGPTAgent(BaseAgent):
            obs = self.env.reset()
        else:
            obs = self.env._get_obs()
+
+        global FINAL_STOP_POINT
+        global TEMP_STEPS_COUNTER
+
+        FINAL_STOP_POINT = obs[0]['stop']
+
+        if TEMP_STEPS_COUNTER != 0:
+            TEMP_STEPS_COUNTER = 0
+
+        print(f"HAVE SET FINAL_STOP_POINT = {FINAL_STOP_POINT}")
        
        print(len(obs))

@ -771,3 +816,110 @@ class NavGPTAgent(BaseAgent):
                self.traj[i]['llm_observation'].append(observation)

        return self.traj
+
+class RandomAgent(BaseAgent):
+    def __init__(
+            self, 
+            env: REVERIENavBatch, 
+            config: Namespace):
+        """
+        Initialize the LLM Navigation Agent.
+
+        Args:
+            env: The Matterport3D environment.
+            config: The configuration.
+        """
+        super().__init__(env)
+        self.config = config
+
+
+    def init_trajecotry(self, obs: List[dict]):
+        """Initialize the trajectory with the given observation."""
+        # Record the navigation path
+        self.traj = [{
+            'instr_id': ob['instr_id'],
+            'path': [[ob['start']]],
+            'details': [],
+        } for ob in obs]
+        # Record the history of actions taken
+
+
+    def make_equiv_action(self, actions: List[str]) -> str:
+        """
+        Interface between Panoramic view and Egocentric view
+        Take in the next viewpoint ID and move the agent to that viewpoint
+        return the turned angle and new observation
+        """
+        def normalize_angle(angle):
+            while angle > 180:
+                angle -= 360
+            while angle <= -180:
+                angle += 360
+            return angle
+
+        def angle_to_left_right(angle):
+            return f"left {-angle:.2f}" if angle < 0 else f"right {angle:.2f}"
+        
+        # Get current agent facing angle
+        cur_obs = self.env._get_obs()[0]
+        cur_heading = np.rad2deg(cur_obs['heading'])
+        # Make the action
+        new_obs = self.env.step(actions)[0]
+        new_heading = np.rad2deg(new_obs['heading'])
+        # Record the trajectory
+        self.traj[0]['path'].append(self.env.env.sims[0].gmap.bfs_shortest_path(cur_obs['viewpoint'], actions[0])[1:])
+        # Calculate the turned angle
+        turned_angle = new_heading - cur_heading
+        # Generate action description
+        cur_heading = angle_to_left_right(normalize_angle(cur_heading))
+        new_heading = angle_to_left_right(normalize_angle(new_heading))
+        action_description = f'Turn heading direction {turned_angle:.2f} degrees from {cur_heading} to {new_heading}.'
+        return action_description, new_obs
+
+    def rollout(self, reset=True):
+        if reset:  # Reset env
+            obs = self.env.reset()
+        else:
+            obs = self.env._get_obs()
+
+        global FINAL_STOP_POINT
+        global TEMP_STEPS_COUNTER
+
+        FINAL_STOP_POINT = obs[0]['stop']
+
+        if TEMP_STEPS_COUNTER != 0:
+            TEMP_STEPS_COUNTER = 0
+
+        print(obs[0].keys())
+        print(obs[0]['obs'])
+        print(obs[0]['obs_summary'])
+        print(obs[0]['objects'])
+        print(obs[0]['instr_id'])
+        print(obs[0]['scan'])
+        print(obs[0]['viewpoint'])
+        print(obs[0]['heading'])
+        print(obs[0]['elevation'])
+        print(obs[0]['candidate'])
+        print(obs[0]['instruction'])
+        print(obs[0]['gt_path'])
+        print(obs[0]['path_id'])
+        print(obs[0]['stop'])
+        print(obs[0]['start'])
+        print(obs[0]['target'])
+
+
+        print("==")
+
+        # Initialize the trajectory
+        self.init_trajecotry(obs)
+
+        for i, init_ob in enumerate(obs):
+            
+                navigable = init_ob['candidate']
+                heading = np.rad2deg(init_ob['heading'])
+                elevation = np.rad2deg(init_ob['elevation'])
+                orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
+
+            output = self.agent_executor(input)
+
+        return self.traj
--- a/nav_src/env.py
+++ b/nav_src/env.py
@ -141,7 +141,7 @@ class REVERIENavBatch(object):

    def __init__(
        self, view_db, instr_data, connectivity_dir, navigable_dir,
-        batch_size=1, seed=0, name=None,
+        batch_size=1, seed=0, name=None, data_limit=100
    ):
        self.env = EnvBatch(navigable_dir, feat_db=view_db, batch_size=batch_size)
        self.data = instr_data
@ -157,6 +157,8 @@ class REVERIENavBatch(object):
        random.seed(self.seed)
        random.shuffle(self.data)

+        self.data = self.data[:data_limit]
+
        self.ix = 0
        self._load_nav_graphs()
        
@ -263,7 +265,7 @@ class REVERIENavBatch(object):
        stops = [item['stop'] for item in self.batch]
        starts = [item['start'] for item in self.batch]
        targets = [item['target'] for item in self.batch]
-        self.env.newEpisodes(scanIds, viewpointIds, headings, stops, starts, targets)
+        self.env.newEpisodes(scanIds, starts, headings, stops, starts, targets)
        return self._get_obs()

    def step(self, next_viewpoint_IDs):
--- a/nav_src/parser.py
+++ b/nav_src/parser.py
@ -18,7 +18,7 @@ def parse_args():
    # parser.add_argument('--llm_model_name', type=str, default='gpt-4', help='llm model name')
    # parser.add_argument('--llm_model_name', type=str, default='LlaMA-2-13b', help='llm model name')
    parser.add_argument('--batch_size', type=int, default=1)
-    parser.add_argument('--max_iterations', type=int, default=10)
+    parser.add_argument('--max_iterations', type=int, default=25)

    # General config
    parser.add_argument('--iters', type=int, default=10, help='number of iterations to run')
--- a/nav_src/prompt/planner_prompt.py
+++ b/nav_src/prompt/planner_prompt.py
@ -248,11 +248,11 @@ VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an ind

 You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Obeservation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward.

-Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs. Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.
+Explore the environment and don't stay at the original point. Keep Walking! Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.

-At each step, determine if you've reached the destination.
-If yes, stop and output 'Final Answer: Finished!'.
-If not, continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
+If you find the object but I haven't said you can stop. You cannot say you have finished the task! Keep exploring the nearby area.
+
+continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
 Show your reasoning in the Thought section.

 Follow the given format and use provided tools.
@ -266,15 +266,12 @@ Instruction: the instruction describing the whole trajectory
 Initial Observation: the initial observation of the environment
 Thought: you should always think about what to do next and why
 Action: the action to take, must be one of the tools [{tool_names}]
-Action Input: "Viewpoint ID"
+Action Input: "Viewpoint ID" but do not stay in the original viewpoint
 Observation: the result of the action
-... (this Thought/Action/Action Input/Observation can repeat N times)
-Thought: I have reached the destination, I can stop.
-Final Answer: Finished!
 ----

 Begin!

 Instruction: {action_plan}
 Initial Observation: {init_observation}
-Thought: I should start navigation according to the instruction, {agent_scratchpad}"""
+Thought: I should start navigation according to the instruction, {agent_scratchpad}"""