Compare commits

...

5 Commits

6 changed files with 235 additions and 43 deletions

View File

@@ -11,7 +11,7 @@ def dump_json(data, filename):
         json.dump(data, fp)
 for f in os.listdir():
-    if 'navgpt' in f:
+    if 'unseen' in f:
         print(f)
         data = load_json(f)
@@ -21,7 +21,12 @@ for f in os.listdir():
             new_i = i.copy()
             new_i['instruction'] = instr
             # new_i['instr_id'] = f'{new_i["id"]}_{index}'
-            new_i['new_reverie_id'] = f'{new_i["new_reverie_id"]}_{index}'
+            new_i['new_reverie_id'] = f'{new_i["id"]}_{index}'
+            new_i['gt_found'] = i['found'][index]
+            new_i['target'] = i['target_objects'][index]
+            new_i['clip_target'] = i['clip_target'][index]
+            del new_i['found']
+            del new_i['target_objects']
             del new_i['instructions']
             new_data.append(new_i)
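For orientation, here is a minimal sketch of what this preprocessing appears to do: split each multi-instruction REVERIE annotation item into one record per instruction, copying the per-instruction labels out of the list-valued fields. The enclosing loop variables (`data`, `i`, `instr`, `index`, `new_data`) and the write-back via `dump_json` are assumptions inferred from the visible lines, not code confirmed by the diff.

# Hypothetical reconstruction of the split loop shown in this hunk.
import json
import os

def load_json(filename):
    with open(filename) as fp:
        return json.load(fp)

def dump_json(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)

for f in os.listdir():
    if 'unseen' in f:
        data = load_json(f)
        new_data = []
        for i in data:
            for index, instr in enumerate(i['instructions']):
                new_i = i.copy()
                new_i['instruction'] = instr
                new_i['new_reverie_id'] = f'{new_i["id"]}_{index}'
                # carry the per-instruction labels over from the list-valued fields
                new_i['gt_found'] = i['found'][index]
                new_i['target'] = i['target_objects'][index]
                new_i['clip_target'] = i['clip_target'][index]
                del new_i['found']
                del new_i['target_objects']
                del new_i['instructions']
                new_data.append(new_i)
        dump_json(new_data, f)  # assumed: results written back in place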

View File

@@ -62,7 +62,14 @@ def valid(args, val_envs):
             open(os.path.join(args.log_dir, "detail_%s.json" % (env_name)), 'w'),
             sort_keys=True, indent=4, separators=(',', ': ')
         )
+        print(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)))
+        json.dump(
+            preds,
+            open(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)), 'w'),
+            sort_keys=True, indent=4, separators=(',', ': ')
+        )
+        '''
         if 'test' not in env_name:
             score_summary, _ = env.eval_metrics(preds)
             loss_str = "Env name: %s" % env_name
@@ -70,11 +77,7 @@ def valid(args, val_envs):
                 loss_str += ', %s: %.2f' % (metric, val)
             write_to_record_file(loss_str+'\n', record_file)
-        json.dump(
-            preds,
-            open(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)), 'w'),
-            sort_keys=True, indent=4, separators=(',', ': ')
-        )
+        '''

 def valid_from_file(args, val_envs):

View File

@@ -26,6 +26,8 @@ from langchain.schema import (
 )
 from langchain.base_language import BaseLanguageModel
+from data_utils import load_json
 from langchain.agents.mrkl.prompt import FORMAT_INSTRUCTIONS
 from prompt.planner_prompt import (
     ACTION_PROMPT,
@@ -45,12 +47,18 @@ FINAL_ANSWER_ACTION = "Final Answer:"
 EXCEPTION_TOOL_NAME = "_Exception"
 MAX_SCRATCHPAD_LENGTH = 7000
+CLIP_TARGET = ""
 FINAL_STOP_POINT = ""
 FINAL_STATE = ""
 SUCCESS = 0
 TEMP_STEPS_COUNTER = 0
 STEPS_COUNTER = 0
 NOW_LOCATION = None
+FOUND_BBOX = ""
+LAST_VP = ""
+THRESHOLD = 0.75
+SCAN = ""

 MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
     "Invalid Format: Missing 'Action:' after 'Thought:"
@@ -62,6 +70,32 @@ FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE = (
     "Parsing LLM output produced both a final answer and a parse-able action:"
 )
+print("Load GroundingDINO confidence file...")
+confidences = load_json('../datasets/REVERIE/annotations/grounding_dino_confidence.json')
+print("Loaded")
+'''
+print("Load CLIP confidence file...")
+confidences = load_json('../datasets/REVERIE/annotations/confidence.json')
+print("Loaded")
+'''
+print()
+
+print("Load distance file...")
+distances = {}
+for SCAN in ['2azQ1b91cZZ', 'X7HyMhZNoso', 'Z6MFQCViBuw', 'TbHJrupSAjP', 'EU6Fwq7SyZv', 'zsNo4HB9uLZ', 'x8F5xyUWy9e', '8194nk5LbLH', 'oLBMNvg9in8', 'QUCTc6BB5sX']:
+    scan_distance = load_json('/data/base_dir/v1/scans/{}/output.json'.format(SCAN))
+    distances[SCAN] = scan_distance
+print("Loaded")
+print()
+
+def is_found(scan, vp, clip_target):
+    found = False
+    for obj in confidences[scan][vp]:
+        prob = confidences[scan][vp][obj][clip_target]
+        if prob >= THRESHOLD:
+            found = True
+    return found

 class NavGPTOutputParser(AgentOutputParser):
     """MRKL Output parser for the chat agent."""
@@ -75,6 +109,10 @@ class NavGPTOutputParser(AgentOutputParser):
         global SUCCESS
         global NOW_LOCATION
         global FINAL_STATE
+        global CLIP_TARGET
+        global SCAN
+        global LAST_VP
+        global FOUND_BBOX
         includes_answer = FINAL_ANSWER_ACTION in text
         regex = (
             r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
@@ -88,6 +126,28 @@ class NavGPTOutputParser(AgentOutputParser):
             action = action_match.group(1).strip()
             action_input = action_match.group(2)
             tool_input = action_input.strip(" ")
+            # confidence to stop
+            if tool_input in confidences[SCAN]:
+                found = False
+                max_bbox, max_bbox_confidence = "", 0
+                for bbox in confidences[SCAN][tool_input][CLIP_TARGET]:
+                    confidence = bbox['score']
+                    if confidence >= THRESHOLD and confidence >= max_bbox_confidence:
+                        max_bbox = bbox
+                        max_bbox_confidence = confidence
+                        FOUND_BBOX = bbox
+                        found = True
+                if found:
+                    FINAL_STATE = 'stop'
+                    LAST_VP = tool_input
+                    print("=============== FOUND OBJECT IN CLIP ===================")
+                    return AgentFinish(
+                        {"output": tool_input}, text
+                    )
             # ensure if its a well formed SQL query we don't remove any trailing " chars
             if tool_input.startswith("SELECT ") is False:
                 tool_input = tool_input.strip('"')
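A side note on the loop above: it tracks `max_bbox`/`max_bbox_confidence` but assigns `FOUND_BBOX` inside the loop, which only stays consistent because the guard updates both together. A tidier equivalent (a sketch, same behavior assumed) is:

# Sketch: keep the single highest-scoring box over THRESHOLD, if any.
candidates = [b for b in confidences[SCAN][tool_input][CLIP_TARGET]
              if b['score'] >= THRESHOLD]
if candidates:
    FOUND_BBOX = max(candidates, key=lambda b: b['score'])
    FINAL_STATE = 'stop'
    LAST_VP = tool_input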
@@ -97,14 +157,18 @@ class NavGPTOutputParser(AgentOutputParser):
             print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}")
             # TEMP_STEPS_COUNTER += 1
+            '''
             print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
             print(f"STEPS_COUNT = {STEPS_COUNTER}")
             print(f"SUCCESS = {SUCCESS}")
+            '''
             NOW_LOCATION = tool_input
             TEMP_STEPS_COUNTER += 1
             print(f"NOW_LOCATION = {NOW_LOCATION}")
+            print(f'ACTION={action}, TOOL_INPUT={tool_input}, TEXT={text}')

             '''
             if FINAL_STOP_POINT in text:
@@ -122,15 +186,17 @@ class NavGPTOutputParser(AgentOutputParser):
             return AgentAction(action, tool_input, text)
         elif includes_answer:
-            is_STOP = 'Finished' in text
-            print("FINAL: ", is_STOP)
+            # is_STOP = 'Finished' in text
+            # print("FINAL: ", is_STOP)
+            '''
             if is_STOP:
                 FINAL_STATE = 'stop'
             else:
                 FINAL_STATE = 'not found'
+            '''

+            '''
             if NOW_LOCATION == FINAL_STOP_POINT:
                 STEPS_COUNTER += TEMP_STEPS_COUNTER
                 TEMP_STEPS_COUNTER = 0
@@ -143,6 +209,8 @@ class NavGPTOutputParser(AgentOutputParser):
             print(f"{FINAL_STOP_POINT}_{type(FINAL_STOP_POINT)}")
             print(f"SUCCESS = {SUCCESS}")
             print(f"STEPS_COUNTER = {STEPS_COUNTER}")
+            '''
+            FINAL_STATE = 'not found'
             return AgentFinish(
                 {"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text
             )
@@ -338,7 +406,9 @@ class NavGPTAgent(BaseAgent):
                 rel_viewpoint_heading = viewpoint_heading - heading_angle
                 rel_viewpoint_heading = normalize_angle(rel_viewpoint_heading)
                 rel_viewpoint_heading = angle_to_left_right(rel_viewpoint_heading)
-                vp_description = rel_viewpoint_heading + f', {viewpoint_data["distance"]:.2f}m'
+                # vp_description = rel_viewpoint_heading + f', {viewpoint_data["distance"]:.2f}m'
+                vp_description = rel_viewpoint_heading
+                vp_description = vp_description + f', {viewpoint_data["wall_distance"]:.2f}m to the wall'
                 # rel_range_idx = (vp_range_idx - range_idx) % 8
                 candidate_range.setdefault(vp_range_idx, {}).update({viewpoint_id: vp_description})
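For concreteness, a hedged sketch of the candidate description this change produces; it swaps the point-to-point distance for the distance-to-wall signal. The heading label and distance value below are invented:

# e.g. rel_viewpoint_heading = 'left 90', wall_distance = 3.42
vp_description = 'left 90' + f', {3.42:.2f}m to the wall'
print(vp_description)  # -> "left 90, 3.42m to the wall"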
@@ -428,6 +498,8 @@ class NavGPTAgent(BaseAgent):
         # Get current observation
         cur_obs = self.env._get_obs()[0]
+        print(cur_obs)
         # Get current feature
         feature = cur_obs['obs']
         heading = np.rad2deg(cur_obs['heading'])
@@ -435,9 +507,14 @@ class NavGPTAgent(BaseAgent):
         objects = cur_obs['objects']
         orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
         navigable = cur_obs['candidate']
+        if self.config.use_relative_angle:
+            for vp, data in navigable.items():
+                data['wall_distance'] = distances[cur_obs['scan']][cur_obs['viewpoint']][vp]
+                print(data['wall_distance'])

-        if self.config.use_relative_angle:
+        if self.config.use_relative_angle: # True
             feature = self.modify_heading_angles(heading, feature, navigable, objects)
-        if self.config.use_navigable:
+        if self.config.use_navigable: # False
             navigable = self.get_navigable_str(heading, elevation, navigable)
         if self.config.use_tool_chain:
@@ -474,6 +551,11 @@ class NavGPTAgent(BaseAgent):
             new_objects = new_obs['objects']
             new_heading = np.rad2deg(new_obs['heading'])
             new_elevation = np.rad2deg(new_obs['elevation'])
+            for vp, data in new_navigable.items():
+                data['wall_distance'] = distances[new_obs['scan']][new_obs['viewpoint']][vp]
+                print(data['wall_distance'])

             if self.config.use_relative_angle:
                 new_feature = self.modify_heading_angles(new_heading, new_feature, new_navigable, new_objects)
             new_orientation = f'\nheading: {new_heading:.2f}, elevation: {new_elevation:.2f}'
@@ -556,6 +638,14 @@ class NavGPTAgent(BaseAgent):
         heading = np.rad2deg(cur_obs['heading'])
         elevation = np.rad2deg(cur_obs['elevation'])
         orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
+        for vp, data in navigable.items():
+            data['wall_distance'] = distances[cur_obs['scan']][cur_obs['viewpoint']][vp]
+            print(data['wall_distance'])

         if self.config.use_relative_angle:
             feature = self.modify_heading_angles(heading, feature, navigable, objects)
         if self.config.use_navigable:
@@ -589,6 +679,12 @@ class NavGPTAgent(BaseAgent):
             new_heading = np.rad2deg(new_obs['heading'])
             new_elevation = np.rad2deg(new_obs['elevation'])
             new_orientation = f'\nheading: {new_heading:.2f}, elevation: {new_elevation:.2f}'
+            for vp, data in new_navigable.items():
+                data['wall_distance'] = distances[new_obs['scan']][new_obs['viewpoint']][vp]
+                print(data['wall_distance'])

             if self.config.use_relative_angle:
                 new_feature = self.modify_heading_angles(new_heading, new_feature, new_navigable, new_objects)
             if self.config.use_navigable:
@@ -733,6 +829,10 @@ class NavGPTAgent(BaseAgent):
         global STEPS_COUNTER
         global FINAL_STATE
         global NOW_LOCATION
+        global SCAN
+        global CLIP_TARGET
+        global LAST_VP
+        global FOUND_BBOX

         FINAL_STOP_POINT = obs[0]['gt_path'][-1]
         FINAL_STATE = ""
@@ -749,17 +849,23 @@ class NavGPTAgent(BaseAgent):
         print(obs[0]['obs_summary'])
         print(obs[0]['objects'])
         print(obs[0]['scan'])
-        print(obs[0]['viewpoint'])
+        print('now:', obs[0]['viewpoint'])
         print(obs[0]['heading'])
         print(obs[0]['elevation'])
         print(obs[0]['candidate'])
         print(obs[0]['instruction'])
-        print(obs[0]['gt_path'])
+        print('path:', obs[0]['gt_path'])
         print(obs[0]['path_id'])
-        print(obs[0]['start'])
+        print('start:', obs[0]['start'])
         print(obs[0]['target'])
         print(obs[0]['new_reverie_id'])
+        print(obs[0]['clip_target'])

         NOW_LOCATION = obs[0]['start']
+        CLIP_TARGET = obs[0]['clip_target']
+        SCAN = obs[0]['scan']
+        LAST_VP = ""
+        FOUND_BBOX = ""
         print("==")
@@ -809,11 +915,16 @@ class NavGPTAgent(BaseAgent):
         # we are HERE
         feature = init_ob['obs']
         navigable = init_ob['candidate']
+        # distances =
         objects = init_ob['objects']
         heading = np.rad2deg(init_ob['heading'])
         elevation = np.rad2deg(init_ob['elevation'])
         orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
+        for vp, data in navigable.items():
+            data['wall_distance'] = distances[init_ob['scan']][init_ob['viewpoint']][vp]
+            print(data['wall_distance'])

         print("use_relative_angle:", self.config.use_relative_angle)
         print("use_navigable:", self.config.use_navigable)
         if self.config.use_relative_angle: # True
@@ -838,6 +949,9 @@ class NavGPTAgent(BaseAgent):
             'init_observation': init_observation, # 8 direction's observation caption & navigable point & objects
         }
         output = self.agent_executor(input)
+        if LAST_VP != "":
+            turned_angle, new_obs = self.make_equiv_action([LAST_VP])

         if 'stop' in FINAL_STATE:
             self.traj[i]['final_state'] = 'stop'
@@ -846,6 +960,8 @@ class NavGPTAgent(BaseAgent):
         self.traj[i]['llm_output'] = output['output']
         self.traj[i]['action_plan'] = output['action_plan']
+        self.traj[i]['bbox'] = FOUND_BBOX

         # extract agent's thought from llm output
         intermediate_steps = output['intermediate_steps']
         self.traj[i]['llm_thought'] = []
@@ -855,4 +971,7 @@ class NavGPTAgent(BaseAgent):
             self.traj[i]['llm_thought'].append(thought)
             self.traj[i]['llm_observation'].append(observation)
+        print("TRAJ: {}".format(self.traj[0]['path']))
+        print(f"status={FINAL_STATE}, FOUND_BBOX={FOUND_BBOX}")
+        print()
         return self.traj

View File

@@ -45,3 +45,8 @@ def construct_reverie_instrs(anno_dir, dataset, splits):
             del new_item['instr_encodings']
             data.append(new_item)
     return data
+
+def load_json(f):
+    with open(f) as fp:
+        data = json.load(fp)
+    return data

View File

@@ -14,6 +14,17 @@ from utils.graph_utils import NavGraph

 ERROR_MARGIN = 3.0

+obj2vps = {}
+bbox_data = json.load(open('/data/Matterport3DSimulator-duet/VLN-DUET/datasets/REVERIE/annotations/BBoxes.json'))
+for scanvp, value in bbox_data.items():
+    scan, vp = scanvp.split('_')
+    # for all visible objects at that viewpoint
+    for objid, objinfo in value.items():
+        if objinfo['visible_pos']:
+            # if such object not already in the dict
+            obj2vps.setdefault(scan+'_'+objid, [])
+            obj2vps[scan+'_'+objid].append(vp)

 def load_floorplan():
     region_label_lookup = load_region_label_lookup()
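For context, `obj2vps` maps "scan_objectId" to the viewpoints from which that object is visible; the evaluation code later in this file treats stopping at any of those viewpoints as success. A hedged one-line illustration with invented IDs:

# Hypothetical lookup: viewpoints that can see object 42 in this scan.
goal_viewpoints = set(obj2vps.get('2azQ1b91cZZ_42', []))
success = float('vp_0a1b' in goal_viewpoints)  # mirrors _eval_item below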
@@ -118,6 +129,8 @@ def load_region_label_lookup():
     }
     return region_label_lookup

+with open('./node_region.json') as fp:
+    node_region = json.load(fp)

 class Simulator(object):
     ''' A simple simulator in Matterport3D environment '''
@@ -144,7 +157,8 @@ class Simulator(object):
         heading: int,
         elevation: int,
         start: str,
-        target: str
+        target: str,
+        clip_target: str,
     ):
         self.heading = heading
         self.elevation = elevation
@@ -152,11 +166,13 @@ class Simulator(object):
         self.viewpoint_ID = viewpoint_ID
         self.start = start
         self.target = target
+        self.clip_target = clip_target

         # Load navigable dict
         navigable_path = os.path.join(self.navigable_dir, self.scan_ID + '_navigable.json')
         with open(navigable_path, 'r') as f:
-            navigable_dict = json.load(f)
+            self.navigable_dict = json.load(f)
+        '''
         self.navigable_dict = {}
         for start, v in navigable_dict.items():
             self.navigable_dict[start] = {}
@@ -168,6 +184,7 @@ class Simulator(object):
                     self.navigable_dict[start][to] = _v
             # print(start_region, to_region)
             # print("AFTER: ", len(self.navigable_dict[start]))
+        '''

         # Get candidate
         self.getCandidate()
@@ -185,7 +202,8 @@ class Simulator(object):
             'elevation': self.elevation,
             'candidate': self.candidate,
             'start': self.start,
-            'target': self.target
+            'target': self.target,
+            'clip_target': self.clip_target,
         }
         return self.state
@@ -230,9 +248,9 @@ class EnvBatch(object):
     def _make_id(self, scanId, viewpointId):
         return scanId + '_' + viewpointId

-    def newEpisodes(self, scanIds, viewpointIds, headings, starts, targets):
-        for i, (scanId, viewpointId, heading, start, target) in enumerate(zip(scanIds, viewpointIds, headings, starts, targets)):
-            self.sims[i].newEpisode(scanId, viewpointId, heading, 0, start, target)
+    def newEpisodes(self, scanIds, viewpointIds, headings, starts, targets, clip_targets):
+        for i, (scanId, viewpointId, heading, start, target, clip_target) in enumerate(zip(scanIds, viewpointIds, headings, starts, targets, clip_targets)):
+            self.sims[i].newEpisode(scanId, viewpointId, heading, 0, start, target, clip_target)

     def getStates(self):
         """
@@ -358,7 +376,8 @@ class REVERIENavBatch(object):
                 'path_id' : item['path_id'],
                 'start': item['start'],
                 'new_reverie_id': item['new_reverie_id'],
-                'target': item['target']
+                'target': item['target'],
+                'clip_target': item['clip_target']
             }
             # RL reward. The negative distance between the state and the final state
             # There are multiple gt end viewpoints on REVERIE.
@@ -382,7 +401,8 @@ class REVERIENavBatch(object):
         headings = [item['heading'] for item in self.batch]
         starts = [item['start'] for item in self.batch]
         targets = [item['target'] for item in self.batch]
-        self.env.newEpisodes(scanIds, starts, headings, starts, targets)
+        clip_targets = [item['clip_target'] for item in self.batch]
+        self.env.newEpisodes(scanIds, starts, headings, starts, targets, clip_targets)
         return self._get_obs()
def step(self, next_viewpoint_IDs): def step(self, next_viewpoint_IDs):
@ -401,7 +421,7 @@ class REVERIENavBatch(object):
near_d = d near_d = d
return near_id return near_id
def _eval_item(self, scan, pred_path, gt_path): def _eval_item(self, scan, pred_path, gt_path, gt_found, found, gt_objid):
scores = {} scores = {}
shortest_distances = self.shortest_distances[scan] shortest_distances = self.shortest_distances[scan]
@@ -420,9 +440,27 @@ class REVERIENavBatch(object):
         gt_lengths = np.sum([shortest_distances[a][b] for a, b in zip(gt_path[:-1], gt_path[1:])])

-        scores['success'] = float(scores['nav_error'] < ERROR_MARGIN)
-        # scores['spl'] = scores['success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01)
-        scores['oracle_success'] = float(scores['oracle_error'] < ERROR_MARGIN)
+        scores['found_success'] = float(gt_found == found)
+
+        goal_viewpoints = set(obj2vps['%s_%s'%(scan, str(gt_objid))])
+        pred_stop_region = node_region[scan][path[-1]]
+        gt_stop_region = node_region[scan][gt_path[-1]]
+
+        # scores['success'] = float(scores['nav_error'] < ERROR_MARGIN)
+        scores['success'] = float(path[-1] in goal_viewpoints)
+        scores['room_success'] = float(gt_stop_region == pred_stop_region)
+        # scores['oracle_success'] = float(scores['oracle_error'] < ERROR_MARGIN)
+        scores['oracle_success'] = float(any(x in goal_viewpoints for x in path))
+
+        scores['spl'] = scores['success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01)
+        scores['sspl_1'] = scores['success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
+        scores['sspl_2'] = scores['room_success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
+        scores['sspl_3'] = scores['oracle_success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
+        scores['ss_1'] = scores['success'] * scores['found_success']
+        scores['ss_2'] = scores['room_success'] * scores['found_success']
+        scores['ss_3'] = scores['oracle_success'] * scores['found_success']

         scores.update(
             cal_dtw(shortest_distances, path, gt_path, scores['success'], ERROR_MARGIN)
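A worked example of these metrics, with invented numbers, to make the definitions concrete:

# gt_lengths = 10 m, trajectory_lengths = 12 m
gt_lengths, traj_len = 10.0, 12.0
success, room_success, found_success = 1.0, 1.0, 1.0
spl = success * gt_lengths / max(traj_len, gt_lengths, 0.01)  # 10/12 ~= 0.833
sspl_1 = spl * found_success                                  # ~0.833
ss_1 = success * found_success                                # 1.0
# If the agent's found flag disagrees with gt_found, found_success = 0,
# which zeroes every ss_* and sspl_* score regardless of the path quality.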
@@ -440,8 +478,9 @@ class REVERIENavBatch(object):
         for item in preds:
             instr_id = item['instr_id']
             traj = item['trajectory']
+            obj_id = instr_id.split('_')[1]
             scan, gt_traj = self.gt_trajs[instr_id]
-            traj_scores = self._eval_item(scan, traj, gt_traj)
+            traj_scores = self._eval_item(scan, traj, gt_traj, item['gt_found'], item['found'], obj_id)
             for k, v in traj_scores.items():
                 metrics[k].append(v)
             metrics['instr_id'].append(instr_id)
@@ -453,8 +492,16 @@ class REVERIENavBatch(object):
             'nav_error': np.mean(metrics['nav_error']),
             'oracle_error': np.mean(metrics['oracle_error']),
             'sr': np.mean(metrics['success']) * 100,
+            'room_success': np.mean(metrics['room_success']) * 100,
+            'found_success': np.mean(metrics['found_success']) * 100,
             'oracle_sr': np.mean(metrics['oracle_success']) * 100,
-            # 'spl': np.mean(metrics['spl']) * 100,
+            'spl': np.mean(metrics['spl']) * 100,
+            'sspl_1': np.mean(metrics['sspl_1']) * 100,
+            'sspl_2': np.mean(metrics['sspl_2']) * 100,
+            'sspl_3': np.mean(metrics['sspl_3']) * 100,
+            'ss_1': np.mean(metrics['ss_1']) * 100,
+            'ss_2': np.mean(metrics['ss_2']) * 100,
+            'ss_3': np.mean(metrics['ss_3']) * 100,
             'nDTW': np.mean(metrics['nDTW']) * 100,
             'SDTW': np.mean(metrics['SDTW']) * 100,
             'CLS': np.mean(metrics['CLS']) * 100,

View File

@@ -244,16 +244,29 @@ Instruction: {action_plan}
 Initial Observation: {init_observation}
 Thought: I should start navigation according to the instruction, {agent_scratchpad}"""

-VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an indoor environment to reach a target viewpoint based on a given instruction, performing the Vision and Language Navigation (VLN) task. The instruction may be either feasible or infeasible (i.e., the specified object might not be found in the environment). You will move among static positions within a pre-defined graph, aiming for the nearest position to the object if the object is present.
+VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an indoor environment to reach a target viewpoint and find the target object based on a given instruction, performing the Vision and Language Navigation (VLN) task.
+The instruction will ask you to find all the target objects in a room. You should have a good strategy for checking all the objects along the shortest path through the room.
+If you find the target object, do not stop; keep exploring the whole room to find the other objects, but still follow a good strategy and do not waste time and energy on movement.
+You will move among static positions within a pre-defined graph, aiming for the nearest position to the object if the object is present.

 You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Observation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward.
-Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs. Reach the instructed destination as closely as possible. The task will fail if you do not reach within 3 meters of the instructed destination, even if it is observable. Therefore, if the destination is visible but you do not see the object within 3 meters, move closer.
-If yes, stop and output 'Final Answer: Finished!'.
-If not, continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
-And if you explored all room, you think this object doesn't exist in this room. stop and output 'Final Answer: Not found!'.
-If you find another room seems to be the closest match to the instruction but no viewpoint can access to this room. Please output "Final Answer: Not found!"
+We also report how many meters extend in the direction of each viewpoint before hitting a wall. This distance information should help you understand the spatial layout of the room; please plan an effective exploration strategy based on it.
+At each step, determine whether you have reached the destination (if the object is more than three meters away from you, you are not considered to have reached it).
+For example, if there are 2 viewpoints to choose from (A: 1m, B: 5m) and you cannot find the target object, viewpoint B is the better choice because it offers more space in which to keep searching for the target.
+Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs; most importantly, you must not leave the room, so do not move close to the door.
+Notice: You should have a good strategy for checking whether the target object exists in this room, and stop once you have explored every viewpoint in the room.
+If you think you are moving in circles, stop and consider whether any other objects may be hidden. If not, output 'Final Answer: Not found'.
+Continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
+And if you have explored the whole room (no other viewpoint to move to), stop and output 'Final Answer: Not found!'.

 Show your reasoning in the Thought section.
 Follow the given format and use provided tools.
@@ -261,22 +274,22 @@ Follow the given format and use provided tools.

 Do not fabricate nonexistent viewpoint IDs.
 ----
-Starting below, you should follow this format:
+Starting below, you should follow this format; do not use any other format:

 Instruction: the instruction describing the whole trajectory
 Initial Observation: the initial observation of the environment
 Thought: you should always think about what to do next and why
 Action: the action to take, must be one of the tools [{tool_names}]
-Action Input: "Viewpoint ID"
+Action Input: "Viewpoint ID"; do not choose an object name or anything else, output only the "Viewpoint ID"
 Observation: the result of the action
 ... (this Thought/Action/Action Input/Observation can repeat N times)
-Thought: I have reached the destination, I can stop.
+Thought: I found my target object, but I should check whether any other objects may be hidden.
 Final Answer: Finished!

 or

-Thought: I cannot find the object in this room, I should stop.
+Thought: I checked that no objects are hidden, I can stop.
 Final Answer: Not found!
 ----

 Begin!