feat: Add bboxes confidences

This commit is contained in:
Ting-Jun Wang 2024-12-15 16:44:33 +08:00
parent 1547974692
commit bc6cb9a9f8
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354
4 changed files with 146 additions and 30 deletions

View File

@ -62,7 +62,14 @@ def valid(args, val_envs):
open(os.path.join(args.log_dir, "detail_%s.json" % (env_name)), 'w'), open(os.path.join(args.log_dir, "detail_%s.json" % (env_name)), 'w'),
sort_keys=True, indent=4, separators=(',', ': ') sort_keys=True, indent=4, separators=(',', ': ')
) )
print(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)))
json.dump(
preds,
open(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)), 'w'),
sort_keys=True, indent=4, separators=(',', ': ')
)
'''
if 'test' not in env_name: if 'test' not in env_name:
score_summary, _ = env.eval_metrics(preds) score_summary, _ = env.eval_metrics(preds)
loss_str = "Env name: %s" % env_name loss_str = "Env name: %s" % env_name
@ -70,11 +77,7 @@ def valid(args, val_envs):
loss_str += ', %s: %.2f' % (metric, val) loss_str += ', %s: %.2f' % (metric, val)
write_to_record_file(loss_str+'\n', record_file) write_to_record_file(loss_str+'\n', record_file)
json.dump( '''
preds,
open(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)), 'w'),
sort_keys=True, indent=4, separators=(',', ': ')
)
def valid_from_file(args, val_envs): def valid_from_file(args, val_envs):

View File

@ -47,14 +47,18 @@ FINAL_ANSWER_ACTION = "Final Answer:"
EXCEPTION_TOOL_NAME = "_Exception" EXCEPTION_TOOL_NAME = "_Exception"
MAX_SCRATCHPAD_LENGTH = 7000 MAX_SCRATCHPAD_LENGTH = 7000
CLIP_TARGET = ""
FINAL_STOP_POINT = "" FINAL_STOP_POINT = ""
FINAL_STATE = "" FINAL_STATE = ""
SUCCESS = 0 SUCCESS = 0
TEMP_STEPS_COUNTER = 0 TEMP_STEPS_COUNTER = 0
STEPS_COUNTER = 0 STEPS_COUNTER = 0
NOW_LOCATION = None NOW_LOCATION = None
FOUND_BBOX = ""
LAST_VP = ""
THRESHOLD = 0.2812 THRESHOLD = 0.278
SCAN = ""
MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = ( MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
"Invalid Format: Missing 'Action:' after 'Thought:" "Invalid Format: Missing 'Action:' after 'Thought:"
@ -69,6 +73,15 @@ FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE = (
print("Load CLIP confidence file...") print("Load CLIP confidence file...")
confidences = load_json('../datasets/REVERIE/annotations/confidence.json') confidences = load_json('../datasets/REVERIE/annotations/confidence.json')
print("Loaded") print("Loaded")
print()
print("Load distance file...")
distances = {}
for SCAN in ['2azQ1b91cZZ', 'X7HyMhZNoso', 'Z6MFQCViBuw', 'TbHJrupSAjP', 'EU6Fwq7SyZv', 'zsNo4HB9uLZ', 'x8F5xyUWy9e', '8194nk5LbLH', 'oLBMNvg9in8', 'QUCTc6BB5sX']:
scan_distance = load_json('/data/base_dir/v1/scans/{}/output.json'.format(SCAN))
distances[SCAN] = scan_distance
print("Loaded")
print()
def is_found(scan, vp, clip_target): def is_found(scan, vp, clip_target):
found = False found = False
@ -91,6 +104,10 @@ class NavGPTOutputParser(AgentOutputParser):
global SUCCESS global SUCCESS
global NOW_LOCATION global NOW_LOCATION
global FINAL_STATE global FINAL_STATE
global CLIP_TARGET
global SCAN
global LAST_VP
global FOUND_BBOX
includes_answer = FINAL_ANSWER_ACTION in text includes_answer = FINAL_ANSWER_ACTION in text
regex = ( regex = (
r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?" r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
@ -104,6 +121,28 @@ class NavGPTOutputParser(AgentOutputParser):
action = action_match.group(1).strip() action = action_match.group(1).strip()
action_input = action_match.group(2) action_input = action_match.group(2)
tool_input = action_input.strip(" ") tool_input = action_input.strip(" ")
# confidence to stop
if tool_input in confidences[SCAN]:
found = False
max_bbox, max_bbox_confidence = "", 0
for bbox in confidences[SCAN][tool_input]:
confidence = confidences[SCAN][tool_input][bbox][CLIP_TARGET]
if confidence >= THRESHOLD and confidence >= max_bbox_confidence:
max_bbox = bbox
max_bbox_confidence = confidence
FOUND_BBOX = bbox
found = True
if found:
FINAL_STATE = 'stop'
LAST_VP = tool_input
print("=============== FOUND OBJECT IN CLIP ===================")
return AgentFinish(
{"output": tool_input}, text
)
# ensure if its a well formed SQL query we don't remove any trailing " chars # ensure if its a well formed SQL query we don't remove any trailing " chars
if tool_input.startswith("SELECT ") is False: if tool_input.startswith("SELECT ") is False:
tool_input = tool_input.strip('"') tool_input = tool_input.strip('"')
@ -113,14 +152,18 @@ class NavGPTOutputParser(AgentOutputParser):
print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}") print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}")
# TEMP_STEPS_COUNTER += 1 # TEMP_STEPS_COUNTER += 1
'''
print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}") print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
print(f"STEPS_COUNT = {STEPS_COUNTER}") print(f"STEPS_COUNT = {STEPS_COUNTER}")
print(f"SUCCESS = {SUCCESS}") print(f"SUCCESS = {SUCCESS}")
'''
NOW_LOCATION = tool_input NOW_LOCATION = tool_input
TEMP_STEPS_COUNTER += 1 TEMP_STEPS_COUNTER += 1
print(f"NOW_LOCATION = {NOW_LOCATION}") print(f"NOW_LOCATION = {NOW_LOCATION}")
print(f'ACTION={action}, TOOL_INPUT={tool_input}, TEXT={text}')
''' '''
if FINAL_STOP_POINT in text: if FINAL_STOP_POINT in text:
@ -138,15 +181,17 @@ class NavGPTOutputParser(AgentOutputParser):
return AgentAction(action, tool_input, text) return AgentAction(action, tool_input, text)
elif includes_answer: elif includes_answer:
is_STOP = 'Finished' in text # is_STOP = 'Finished' in text
print("FINAL: ", is_STOP) # print("FINAL: ", is_STOP)
'''
if is_STOP: if is_STOP:
FINAL_STATE = 'stop' FINAL_STATE = 'stop'
else: else:
FINAL_STATE = 'not found' FINAL_STATE = 'not found'
'''
'''
if NOW_LOCATION == FINAL_STOP_POINT: if NOW_LOCATION == FINAL_STOP_POINT:
STEPS_COUNTER += TEMP_STEPS_COUNTER STEPS_COUNTER += TEMP_STEPS_COUNTER
TEMP_STEPS_COUNTER = 0 TEMP_STEPS_COUNTER = 0
@ -159,6 +204,8 @@ class NavGPTOutputParser(AgentOutputParser):
print(f"{FINAL_STOP_POINT}_{type(FINAL_STOP_POINT)}") print(f"{FINAL_STOP_POINT}_{type(FINAL_STOP_POINT)}")
print(f"SUCCESS = {SUCCESS}") print(f"SUCCESS = {SUCCESS}")
print(f"STEPS_COUNTER = {STEPS_COUNTER}") print(f"STEPS_COUNTER = {STEPS_COUNTER}")
'''
FINAL_STATE = 'not found'
return AgentFinish( return AgentFinish(
{"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text {"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text
) )
@ -749,6 +796,10 @@ class NavGPTAgent(BaseAgent):
global STEPS_COUNTER global STEPS_COUNTER
global FINAL_STATE global FINAL_STATE
global NOW_LOCATION global NOW_LOCATION
global SCAN
global CLIP_TARGET
global LAST_VP
global FOUND_BBOX
FINAL_STOP_POINT = obs[0]['gt_path'][-1] FINAL_STOP_POINT = obs[0]['gt_path'][-1]
FINAL_STATE = "" FINAL_STATE = ""
@ -765,18 +816,22 @@ class NavGPTAgent(BaseAgent):
print(obs[0]['obs_summary']) print(obs[0]['obs_summary'])
print(obs[0]['objects']) print(obs[0]['objects'])
print(obs[0]['scan']) print(obs[0]['scan'])
print(obs[0]['viewpoint']) print('now:', obs[0]['viewpoint'])
print(obs[0]['heading']) print(obs[0]['heading'])
print(obs[0]['elevation']) print(obs[0]['elevation'])
print(obs[0]['candidate']) print(obs[0]['candidate'])
print(obs[0]['instruction']) print(obs[0]['instruction'])
print(obs[0]['gt_path']) print('path:', obs[0]['gt_path'])
print(obs[0]['path_id']) print(obs[0]['path_id'])
print(obs[0]['start']) print('start:', obs[0]['start'])
print(obs[0]['target']) print(obs[0]['target'])
print(obs[0]['new_reverie_id']) print(obs[0]['new_reverie_id'])
print(obs[0]['clip_target']) print(obs[0]['clip_target'])
NOW_LOCATION = obs[0]['start'] NOW_LOCATION = obs[0]['start']
CLIP_TARGET = obs[0]['clip_target']
SCAN = obs[0]['scan']
LAST_VP = ""
FOUND_BBOX = ""
@ -856,6 +911,9 @@ class NavGPTAgent(BaseAgent):
'init_observation': init_observation, # 8 direction's observation caption & navigable point & objects 'init_observation': init_observation, # 8 direction's observation caption & navigable point & objects
} }
output = self.agent_executor(input) output = self.agent_executor(input)
if LAST_VP != "":
turned_angle, new_obs = self.make_equiv_action([LAST_VP])
if 'stop' in FINAL_STATE: if 'stop' in FINAL_STATE:
self.traj[i]['final_state'] = 'stop' self.traj[i]['final_state'] = 'stop'
@ -864,6 +922,8 @@ class NavGPTAgent(BaseAgent):
self.traj[i]['llm_output'] = output['output'] self.traj[i]['llm_output'] = output['output']
self.traj[i]['action_plan'] = output['action_plan'] self.traj[i]['action_plan'] = output['action_plan']
self.traj[i]['bbox'] = FOUND_BBOX
# extract agent's thought from llm output # extract agent's thought from llm output
intermediate_steps = output['intermediate_steps'] intermediate_steps = output['intermediate_steps']
self.traj[i]['llm_thought'] = [] self.traj[i]['llm_thought'] = []
@ -873,4 +933,7 @@ class NavGPTAgent(BaseAgent):
self.traj[i]['llm_thought'].append(thought) self.traj[i]['llm_thought'].append(thought)
self.traj[i]['llm_observation'].append(observation) self.traj[i]['llm_observation'].append(observation)
print("TRAJ: {}".format(self.traj[0]['path']))
print(f"status={FINAL_STATE}, FOUND_BBOX={FOUND_BBOX}")
print()
return self.traj return self.traj

View File

@ -14,6 +14,17 @@ from utils.graph_utils import NavGraph
ERROR_MARGIN = 3.0 ERROR_MARGIN = 3.0
obj2vps = {}
bbox_data = json.load(open('/data/Matterport3DSimulator-duet/VLN-DUET/datasets/REVERIE/annotations/BBoxes.json'))
for scanvp, value in bbox_data.items():
scan, vp = scanvp.split('_')
# for all visible objects at that viewpoint
for objid, objinfo in value.items():
if objinfo['visible_pos']:
# if such object not already in the dict
obj2vps.setdefault(scan+'_'+objid, [])
obj2vps[scan+'_'+objid].append(vp)
def load_floorplan(): def load_floorplan():
region_label_lookup = load_region_label_lookup() region_label_lookup = load_region_label_lookup()
@ -118,6 +129,8 @@ def load_region_label_lookup():
} }
return region_label_lookup return region_label_lookup
with open('./node_region.json') as fp:
node_region = json.load(fp)
class Simulator(object): class Simulator(object):
''' A simple simulator in Matterport3D environment ''' ''' A simple simulator in Matterport3D environment '''
@ -157,8 +170,9 @@ class Simulator(object):
# Load navigable dict # Load navigable dict
navigable_path = os.path.join(self.navigable_dir, self.scan_ID + '_navigable.json') navigable_path = os.path.join(self.navigable_dir, self.scan_ID + '_navigable.json')
with open(navigable_path, 'r') as f: with open(navigable_path, 'r') as f:
navigable_dict = json.load(f) self.navigable_dict = json.load(f)
'''
self.navigable_dict = {} self.navigable_dict = {}
for start, v in navigable_dict.items(): for start, v in navigable_dict.items():
self.navigable_dict[start] = {} self.navigable_dict[start] = {}
@ -170,6 +184,7 @@ class Simulator(object):
self.navigable_dict[start][to] = _v self.navigable_dict[start][to] = _v
# print(start_region, to_region) # print(start_region, to_region)
# print("AFTER: ", len(self.navigable_dict[start])) # print("AFTER: ", len(self.navigable_dict[start]))
'''
# Get candidate # Get candidate
self.getCandidate() self.getCandidate()
@ -406,7 +421,7 @@ class REVERIENavBatch(object):
near_d = d near_d = d
return near_id return near_id
def _eval_item(self, scan, pred_path, gt_path): def _eval_item(self, scan, pred_path, gt_path, gt_found, found, gt_objid):
scores = {} scores = {}
shortest_distances = self.shortest_distances[scan] shortest_distances = self.shortest_distances[scan]
@ -425,9 +440,27 @@ class REVERIENavBatch(object):
gt_lengths = np.sum([shortest_distances[a][b] for a, b in zip(gt_path[:-1], gt_path[1:])]) gt_lengths = np.sum([shortest_distances[a][b] for a, b in zip(gt_path[:-1], gt_path[1:])])
scores['success'] = float(scores['nav_error'] < ERROR_MARGIN) scores['found_success'] = float(gt_found == found)
# scores['spl'] = scores['success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01)
scores['oracle_success'] = float(scores['oracle_error'] < ERROR_MARGIN) goal_viewpoints = set(obj2vps['%s_%s'%(scan, str(gt_objid))])
pred_stop_region = node_region[scan][path[-1]]
gt_stop_region = node_region[scan][gt_path[-1]]
# scores['success'] = float(scores['nav_error'] < ERROR_MARGIN)
scores['success'] = float(path[-1] in goal_viewpoints)
scores['room_success'] = float(gt_stop_region == pred_stop_region)
# scores['oracle_success'] = float(scores['oracle_error'] < ERROR_MARGIN)
scores['oracle_success'] = float(any(x in goal_viewpoints for x in path))
scores['spl'] = scores['success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01)
scores['sspl_1'] = scores['success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
scores['sspl_2'] = scores['room_success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
scores['sspl_3'] = scores['oracle_success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
scores['ss_1'] = scores['success'] * scores['found_success']
scores['ss_2'] = scores['room_success'] * scores['found_success']
scores['ss_3'] = scores['oracle_success'] * scores['found_success']
scores.update( scores.update(
cal_dtw(shortest_distances, path, gt_path, scores['success'], ERROR_MARGIN) cal_dtw(shortest_distances, path, gt_path, scores['success'], ERROR_MARGIN)
@ -445,8 +478,9 @@ class REVERIENavBatch(object):
for item in preds: for item in preds:
instr_id = item['instr_id'] instr_id = item['instr_id']
traj = item['trajectory'] traj = item['trajectory']
obj_id = instr_id.split('_')[1]
scan, gt_traj = self.gt_trajs[instr_id] scan, gt_traj = self.gt_trajs[instr_id]
traj_scores = self._eval_item(scan, traj, gt_traj) traj_scores = self._eval_item(scan, traj, gt_traj, item['gt_found'], item['found'], obj_id)
for k, v in traj_scores.items(): for k, v in traj_scores.items():
metrics[k].append(v) metrics[k].append(v)
metrics['instr_id'].append(instr_id) metrics['instr_id'].append(instr_id)
@ -458,8 +492,16 @@ class REVERIENavBatch(object):
'nav_error': np.mean(metrics['nav_error']), 'nav_error': np.mean(metrics['nav_error']),
'oracle_error': np.mean(metrics['oracle_error']), 'oracle_error': np.mean(metrics['oracle_error']),
'sr': np.mean(metrics['success']) * 100, 'sr': np.mean(metrics['success']) * 100,
'room_success': np.mean(metrics['room_success']) * 100,
'found_success': np.mean(metrics['found_success']) * 100,
'oracle_sr': np.mean(metrics['oracle_success']) * 100, 'oracle_sr': np.mean(metrics['oracle_success']) * 100,
# 'spl': np.mean(metrics['spl']) * 100, 'spl': np.mean(metrics['spl']) * 100,
'sspl_1': np.mean(metrics['sspl_1']) * 100,
'sspl_2': np.mean(metrics['sspl_2']) * 100,
'sspl_3': np.mean(metrics['sspl_3']) * 100,
'ss_1': np.mean(metrics['ss_1']) * 100,
'ss_2': np.mean(metrics['ss_2']) * 100,
'ss_3': np.mean(metrics['ss_3']) * 100,
'nDTW': np.mean(metrics['nDTW']) * 100, 'nDTW': np.mean(metrics['nDTW']) * 100,
'SDTW': np.mean(metrics['SDTW']) * 100, 'SDTW': np.mean(metrics['SDTW']) * 100,
'CLS': np.mean(metrics['CLS']) * 100, 'CLS': np.mean(metrics['CLS']) * 100,

View File

@ -244,16 +244,24 @@ Instruction: {action_plan}
Initial Observation: {init_observation} Initial Observation: {init_observation}
Thought: I should start navigation according to the instruction, {agent_scratchpad}""" Thought: I should start navigation according to the instruction, {agent_scratchpad}"""
VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an indoor environment to reach a target viewpoint based on a given instruction, performing the Vision and Language Navigation (VLN) task. The instruction may be either feasible or infeasible (i.e., the specified object might not be found in the environment). You will move among static positions within a pre-defined graph, aiming for the nearest position to the object if the object is present. VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate in an indoor environment to reach a target viewpoint based on a given instruction, performing the Vision and Language Navigation (VLN) task.
The instruction will let you find all the target objects in a room. You should have a good strategy to check all the objects in the shortest path in the room.
But if you find the target object, don't stop, keep exploring the whole room to find other objects, but you still should have a good strategy — don't waste time and energy to move.
You will move among static positions within a pre-defined graph, aiming for the nearest position to the object if the object is present.
You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Observation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward. You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Observation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward.
Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs. Reach the instructed destination as closely as possible. The task will fail if you do not reach within 3 meters of the instructed destination, even if it is observable. Therefore, if the destination is visible but you do not see the object within 3 meters, move closer. Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs, and the most important thing is that you should not leave the room, so you had better not move close to the door.
At each step, determine if you've reached the destination(If the object is more than three meters away from you, you are not considered to have reached the destination).
If yes, stop and output 'Final Answer: Finished!'. Notice: You should have a good strategy to check whether the target object exists in this room.
If not, continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
And if you explored all rooms and you think this object doesn't exist in this room, stop and output 'Final Answer: Not found!'. If you think you are moving in circles, please stop and think whether any other objects may be hidden. If not, please output 'Final Answer: Not found'.
If you find another room seems to be the closest match to the instruction but no viewpoint can access to this room. Please output "Final Answer: Not found!"
Continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
And if you explored all rooms (no other viewpoint to move to), stop and output 'Final Answer: Not found!'.
Show your reasoning in the Thought section. Show your reasoning in the Thought section.
Follow the given format and use provided tools. Follow the given format and use provided tools.
@ -270,13 +278,13 @@ Action: the action to take, must be one of the tools [{tool_names}]
Action Input: "Viewpoint ID" Action Input: "Viewpoint ID"
Observation: the result of the action Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times) ... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I have reached the destination, I can stop. Thought: I found my target object, but I should check whether any other objects may be hidden.
Final Answer: Finished!
or or
Thought: I cannot find the object in this room, I should stop. Thought: I checked that no objects are hidden, I can stop.
Final Answer: Not found! Final Answer: Not found!
---- ----
Begin! Begin!