Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4a373bc0b8 | |||
| d2f7dc0798 | |||
| edc6d6f198 | |||
| 7e5abbca0e | |||
| d00b84432e | |||
| bc6cb9a9f8 | |||
| 1547974692 | |||
| cd2e0a30e4 | |||
| a85950f06f | |||
| 5cbd75711e | |||
| 64fbce018a | |||
| 68330c5163 |
@ -11,7 +11,8 @@ def dump_json(data, filename):
|
||||
json.dump(data, fp)
|
||||
|
||||
for f in os.listdir():
|
||||
if 'json' in f:
|
||||
if 'unseen' in f:
|
||||
print(f)
|
||||
data = load_json(f)
|
||||
|
||||
new_data = []
|
||||
@ -19,7 +20,13 @@ for f in os.listdir():
|
||||
for index, instr in enumerate(i['instructions']):
|
||||
new_i = i.copy()
|
||||
new_i['instruction'] = instr
|
||||
new_i['instr_id'] = f'{new_i["id"]}_{index}'
|
||||
# new_i['instr_id'] = f'{new_i["id"]}_{index}'
|
||||
new_i['new_reverie_id'] = f'{new_i["id"]}_{index}'
|
||||
new_i['gt_found'] = i['found'][index]
|
||||
new_i['target'] = i['target_objects'][index]
|
||||
new_i['clip_target'] = i['clip_target'][index]
|
||||
del new_i['found']
|
||||
del new_i['target_objects']
|
||||
del new_i['instructions']
|
||||
|
||||
new_data.append(new_i)
|
||||
|
||||
@ -10,7 +10,7 @@ from parser import parse_args
|
||||
from env import REVERIENavBatch
|
||||
from agent import NavGPTAgent
|
||||
|
||||
def build_dataset(args, data_limit=100):
|
||||
def build_dataset(args):
|
||||
|
||||
feat_db = ImageObservationsDB(args.obs_dir, args.obs_summary_dir, args.obj_dir)
|
||||
print(feat_db)
|
||||
@ -26,7 +26,7 @@ def build_dataset(args, data_limit=100):
|
||||
)
|
||||
val_env = dataset_class(
|
||||
feat_db, val_instr_data, args.connectivity_dir, args.navigable_dir,
|
||||
batch_size=args.batch_size, seed=args.seed, name=split, data_limit=data_limit
|
||||
batch_size=args.batch_size, seed=args.seed, name=split
|
||||
) # evaluation using all objects
|
||||
val_envs[split] = val_env
|
||||
|
||||
@ -62,7 +62,14 @@ def valid(args, val_envs):
|
||||
open(os.path.join(args.log_dir, "detail_%s.json" % (env_name)), 'w'),
|
||||
sort_keys=True, indent=4, separators=(',', ': ')
|
||||
)
|
||||
print(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)))
|
||||
json.dump(
|
||||
preds,
|
||||
open(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)), 'w'),
|
||||
sort_keys=True, indent=4, separators=(',', ': ')
|
||||
)
|
||||
|
||||
'''
|
||||
if 'test' not in env_name:
|
||||
score_summary, _ = env.eval_metrics(preds)
|
||||
loss_str = "Env name: %s" % env_name
|
||||
@ -70,11 +77,7 @@ def valid(args, val_envs):
|
||||
loss_str += ', %s: %.2f' % (metric, val)
|
||||
write_to_record_file(loss_str+'\n', record_file)
|
||||
|
||||
json.dump(
|
||||
preds,
|
||||
open(os.path.join(args.pred_dir, "%s_%s.json" % (prefix, env_name)), 'w'),
|
||||
sort_keys=True, indent=4, separators=(',', ': ')
|
||||
)
|
||||
'''
|
||||
|
||||
|
||||
def valid_from_file(args, val_envs):
|
||||
@ -96,7 +99,7 @@ def valid_from_file(args, val_envs):
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
val_envs = build_dataset(args, data_limit=100)
|
||||
val_envs = build_dataset(args)
|
||||
|
||||
if args.valid_file is not None:
|
||||
valid_from_file(args, val_envs)
|
||||
|
||||
189
nav_src/agent.py
189
nav_src/agent.py
@ -26,6 +26,8 @@ from langchain.schema import (
|
||||
)
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
|
||||
from data_utils import load_json
|
||||
|
||||
from langchain.agents.mrkl.prompt import FORMAT_INSTRUCTIONS
|
||||
from prompt.planner_prompt import (
|
||||
ACTION_PROMPT,
|
||||
@ -45,10 +47,18 @@ FINAL_ANSWER_ACTION = "Final Answer:"
|
||||
EXCEPTION_TOOL_NAME = "_Exception"
|
||||
MAX_SCRATCHPAD_LENGTH = 7000
|
||||
|
||||
CLIP_TARGET = ""
|
||||
FINAL_STOP_POINT = ""
|
||||
FINAL_STATE = ""
|
||||
SUCCESS = 0
|
||||
TEMP_STEPS_COUNTER = 0
|
||||
STEPS_COUNTER = 0
|
||||
NOW_LOCATION = None
|
||||
FOUND_BBOX = ""
|
||||
LAST_VP = ""
|
||||
|
||||
THRESHOLD = 0.75
|
||||
SCAN = ""
|
||||
|
||||
MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
|
||||
"Invalid Format: Missing 'Action:' after 'Thought:"
|
||||
@ -60,6 +70,32 @@ FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE = (
|
||||
"Parsing LLM output produced both a final answer and a parse-able action:"
|
||||
)
|
||||
|
||||
print("Load GroundingDINO confidence file...")
|
||||
confidences = load_json('../datasets/REVERIE/annotations/grounding_dino_confidence.json')
|
||||
print("Loaded")
|
||||
'''
|
||||
print("Load CLIP confidence file...")
|
||||
confidences = load_json('../datasets/REVERIE/annotations/confidence.json')
|
||||
print("Loaded")
|
||||
'''
|
||||
print()
|
||||
|
||||
print("Load distance file...")
|
||||
distances = {}
|
||||
for SCAN in ['2azQ1b91cZZ', 'X7HyMhZNoso', 'Z6MFQCViBuw', 'TbHJrupSAjP', 'EU6Fwq7SyZv', 'zsNo4HB9uLZ', 'x8F5xyUWy9e', '8194nk5LbLH', 'oLBMNvg9in8', 'QUCTc6BB5sX']:
|
||||
scan_distance = load_json('/data/base_dir/v1/scans/{}/output.json'.format(SCAN))
|
||||
distances[SCAN] = scan_distance
|
||||
print("Loaded")
|
||||
print()
|
||||
|
||||
def is_found(scan, vp, clip_target):
    """Report whether any object at a viewpoint reaches the score threshold.

    For every object key stored under ``confidences[scan][vp]`` the value
    recorded under ``clip_target`` is compared against ``THRESHOLD``.

    NOTE(review): the stop check in the output parser reads the same table as
    ``confidences[SCAN][vp][CLIP_TARGET]`` -> list of boxes with a ``'score'``
    field, which is a different nesting from the one used here. Confirm which
    schema the loaded confidence file actually has before relying on this.

    Args:
        scan: Key of the scan in the module-level ``confidences`` table.
        vp: Key of the viewpoint inside that scan.
        clip_target: Key of the target whose score is looked up per object.

    Returns:
        True if at least one looked-up score is >= ``THRESHOLD``, else False.
    """
    per_object = confidences[scan][vp]
    # Materialise the whole list (no short-circuit) so every object's entry is
    # read, exactly as a plain loop over all objects would do.
    hits = [per_object[name][clip_target] >= THRESHOLD for name in per_object]
    return any(hits)
|
||||
|
||||
class NavGPTOutputParser(AgentOutputParser):
|
||||
"""MRKL Output parser for the chat agent."""
|
||||
@ -71,19 +107,47 @@ class NavGPTOutputParser(AgentOutputParser):
|
||||
global STEPS_COUNTER
|
||||
global TEMP_STEPS_COUNTER
|
||||
global SUCCESS
|
||||
# includes_answer = FINAL_ANSWER_ACTION in text
|
||||
global NOW_LOCATION
|
||||
global FINAL_STATE
|
||||
global CLIP_TARGET
|
||||
global SCAN
|
||||
global LAST_VP
|
||||
global FOUND_BBOX
|
||||
includes_answer = FINAL_ANSWER_ACTION in text
|
||||
regex = (
|
||||
r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*\"?([a-fA-F0-9]{32})\"?"
|
||||
)
|
||||
action_match = re.search(regex, text, re.DOTALL)
|
||||
if action_match:
|
||||
# if includes_answer:
|
||||
# raise OutputParserException(
|
||||
# f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
|
||||
# )
|
||||
if includes_answer:
|
||||
raise OutputParserException(
|
||||
f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
|
||||
)
|
||||
action = action_match.group(1).strip()
|
||||
action_input = action_match.group(2)
|
||||
tool_input = action_input.strip(" ")
|
||||
|
||||
|
||||
# confidence to stop
|
||||
if tool_input in confidences[SCAN]:
|
||||
found = False
|
||||
max_bbox, max_bbox_confidence = "", 0
|
||||
|
||||
for bbox in confidences[SCAN][tool_input][CLIP_TARGET]:
|
||||
confidence = bbox['score']
|
||||
if confidence >= THRESHOLD and confidence >= max_bbox_confidence:
|
||||
max_bbox = bbox
|
||||
max_bbox_confidence = confidence
|
||||
FOUND_BBOX = bbox
|
||||
found = True
|
||||
if found:
|
||||
FINAL_STATE = 'stop'
|
||||
LAST_VP = tool_input
|
||||
print("=============== FOUND OBJECT IN CLIP ===================")
|
||||
return AgentFinish(
|
||||
{"output": tool_input}, text
|
||||
)
|
||||
|
||||
# ensure if its a well formed SQL query we don't remove any trailing " chars
|
||||
if tool_input.startswith("SELECT ") is False:
|
||||
tool_input = tool_input.strip('"')
|
||||
@ -92,13 +156,21 @@ class NavGPTOutputParser(AgentOutputParser):
|
||||
print("ACTION: ", action_input)
|
||||
print(f"MY FINAL_STOP_POINT = {FINAL_STOP_POINT}")
|
||||
|
||||
TEMP_STEPS_COUNTER += 1
|
||||
# TEMP_STEPS_COUNTER += 1
|
||||
'''
|
||||
print(f"TEMP_STEPS_COUNT = {TEMP_STEPS_COUNTER}")
|
||||
print(f"STEPS_COUNT = {STEPS_COUNTER}")
|
||||
print(f"SUCCESS = {SUCCESS}")
|
||||
'''
|
||||
|
||||
NOW_LOCATION = tool_input
|
||||
TEMP_STEPS_COUNTER += 1
|
||||
print(f"NOW_LOCATION = {NOW_LOCATION}")
|
||||
|
||||
print(f'ACTION={action}, TOOL_INPUT={tool_input}, TEXT={text}')
|
||||
|
||||
|
||||
|
||||
'''
|
||||
if FINAL_STOP_POINT in text:
|
||||
STEPS_COUNTER += TEMP_STEPS_COUNTER
|
||||
SUCCESS += 1
|
||||
@ -110,14 +182,38 @@ class NavGPTOutputParser(AgentOutputParser):
|
||||
return AgentFinish(
|
||||
{"output": action_input}, text
|
||||
)
|
||||
'''
|
||||
|
||||
return AgentAction(action, tool_input, text)
|
||||
'''
|
||||
elif includes_answer:
|
||||
# is_STOP = 'Finished' in text
|
||||
# print("FINAL: ", is_STOP)
|
||||
|
||||
'''
|
||||
if is_STOP:
|
||||
FINAL_STATE = 'stop'
|
||||
else:
|
||||
FINAL_STATE = 'not found'
|
||||
'''
|
||||
|
||||
'''
|
||||
if NOW_LOCATION == FINAL_STOP_POINT:
|
||||
STEPS_COUNTER += TEMP_STEPS_COUNTER
|
||||
TEMP_STEPS_COUNTER = 0
|
||||
SUCCESS += 1
|
||||
print(f"SUCCESS = {SUCCESS}")
|
||||
else:
|
||||
|
||||
print("NOT SUCCESS")
|
||||
print(f"{NOW_LOCATION}_{type(NOW_LOCATION)}")
|
||||
print(f"{FINAL_STOP_POINT}_{type(FINAL_STOP_POINT)}")
|
||||
print(f"SUCCESS = {SUCCESS}")
|
||||
print(f"STEPS_COUNTER = {STEPS_COUNTER}")
|
||||
'''
|
||||
FINAL_STATE = 'not found'
|
||||
return AgentFinish(
|
||||
{"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text
|
||||
)
|
||||
'''
|
||||
|
||||
if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL):
|
||||
raise OutputParserException(
|
||||
@ -364,7 +460,7 @@ class NavGPTAgent(BaseAgent):
|
||||
"""Initialize the trajectory with the given observation."""
|
||||
# Record the navigation path
|
||||
self.traj = [{
|
||||
'instr_id': ob['instr_id'],
|
||||
'instr_id': ob['new_reverie_id'],
|
||||
'path': [[ob['start']]],
|
||||
'details': [],
|
||||
} for ob in obs]
|
||||
@ -400,6 +496,8 @@ class NavGPTAgent(BaseAgent):
|
||||
# Get current observation
|
||||
cur_obs = self.env._get_obs()[0]
|
||||
|
||||
print(cur_obs)
|
||||
|
||||
# Get current feature
|
||||
feature = cur_obs['obs']
|
||||
heading = np.rad2deg(cur_obs['heading'])
|
||||
@ -407,9 +505,11 @@ class NavGPTAgent(BaseAgent):
|
||||
objects = cur_obs['objects']
|
||||
orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
|
||||
navigable = cur_obs['candidate']
|
||||
if self.config.use_relative_angle:
|
||||
|
||||
|
||||
if self.config.use_relative_angle: # True
|
||||
feature = self.modify_heading_angles(heading, feature, navigable, objects)
|
||||
if self.config.use_navigable:
|
||||
if self.config.use_navigable: # False
|
||||
navigable = self.get_navigable_str(heading, elevation, navigable)
|
||||
|
||||
if self.config.use_tool_chain:
|
||||
@ -446,6 +546,8 @@ class NavGPTAgent(BaseAgent):
|
||||
new_objects = new_obs['objects']
|
||||
new_heading = np.rad2deg(new_obs['heading'])
|
||||
new_elevation = np.rad2deg(new_obs['elevation'])
|
||||
|
||||
|
||||
if self.config.use_relative_angle:
|
||||
new_feature = self.modify_heading_angles(new_heading, new_feature, new_navigable, new_objects)
|
||||
new_orientation = f'\nheading: {new_heading:.2f}, elevation: {new_elevation:.2f}'
|
||||
@ -528,6 +630,11 @@ class NavGPTAgent(BaseAgent):
|
||||
heading = np.rad2deg(cur_obs['heading'])
|
||||
elevation = np.rad2deg(cur_obs['elevation'])
|
||||
orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if self.config.use_relative_angle:
|
||||
feature = self.modify_heading_angles(heading, feature, navigable, objects)
|
||||
if self.config.use_navigable:
|
||||
@ -561,6 +668,9 @@ class NavGPTAgent(BaseAgent):
|
||||
new_heading = np.rad2deg(new_obs['heading'])
|
||||
new_elevation = np.rad2deg(new_obs['elevation'])
|
||||
new_orientation = f'\nheading: {new_heading:.2f}, elevation: {new_elevation:.2f}'
|
||||
|
||||
|
||||
|
||||
if self.config.use_relative_angle:
|
||||
new_feature = self.modify_heading_angles(new_heading, new_feature, new_navigable, new_objects)
|
||||
if self.config.use_navigable:
|
||||
@ -604,7 +714,7 @@ class NavGPTAgent(BaseAgent):
|
||||
|
||||
tools = [
|
||||
self.action_maker,
|
||||
self.back_tracer
|
||||
self.back_tracer,
|
||||
]
|
||||
|
||||
if self.config.use_tool_chain:
|
||||
@ -682,7 +792,10 @@ class NavGPTAgent(BaseAgent):
|
||||
new_obs = self.env.step(actions)[0]
|
||||
new_heading = np.rad2deg(new_obs['heading'])
|
||||
# Record the trajectory
|
||||
self.traj[0]['path'].append(self.env.env.sims[0].gmap.bfs_shortest_path(cur_obs['viewpoint'], actions[0])[1:])
|
||||
try:
|
||||
self.traj[0]['path'].append(self.env.env.sims[0].gmap.bfs_shortest_path(cur_obs['viewpoint'], actions[0])[1:])
|
||||
except:
|
||||
None
|
||||
# Calculate the turned angle
|
||||
turned_angle = new_heading - cur_heading
|
||||
# Generate action description
|
||||
@ -699,8 +812,16 @@ class NavGPTAgent(BaseAgent):
|
||||
|
||||
global FINAL_STOP_POINT
|
||||
global TEMP_STEPS_COUNTER
|
||||
global STEPS_COUNTER
|
||||
global FINAL_STATE
|
||||
global NOW_LOCATION
|
||||
global SCAN
|
||||
global CLIP_TARGET
|
||||
global LAST_VP
|
||||
global FOUND_BBOX
|
||||
|
||||
FINAL_STOP_POINT = obs[0]['stop']
|
||||
FINAL_STOP_POINT = obs[0]['gt_path'][-1]
|
||||
FINAL_STATE = ""
|
||||
|
||||
if TEMP_STEPS_COUNTER != 0:
|
||||
TEMP_STEPS_COUNTER = 0
|
||||
@ -713,18 +834,24 @@ class NavGPTAgent(BaseAgent):
|
||||
print(obs[0]['obs'])
|
||||
print(obs[0]['obs_summary'])
|
||||
print(obs[0]['objects'])
|
||||
print(obs[0]['instr_id'])
|
||||
print(obs[0]['scan'])
|
||||
print(obs[0]['viewpoint'])
|
||||
print('now:', obs[0]['viewpoint'])
|
||||
print(obs[0]['heading'])
|
||||
print(obs[0]['elevation'])
|
||||
print(obs[0]['candidate'])
|
||||
print(obs[0]['instruction'])
|
||||
print(obs[0]['gt_path'])
|
||||
print('path:', obs[0]['gt_path'])
|
||||
print(obs[0]['path_id'])
|
||||
print(obs[0]['stop'])
|
||||
print(obs[0]['start'])
|
||||
print('start:', obs[0]['start'])
|
||||
print(obs[0]['target'])
|
||||
print(obs[0]['new_reverie_id'])
|
||||
print(obs[0]['clip_target'])
|
||||
NOW_LOCATION = obs[0]['start']
|
||||
CLIP_TARGET = obs[0]['clip_target']
|
||||
SCAN = obs[0]['scan']
|
||||
LAST_VP = ""
|
||||
FOUND_BBOX = ""
|
||||
|
||||
|
||||
|
||||
print("==")
|
||||
@ -733,7 +860,7 @@ class NavGPTAgent(BaseAgent):
|
||||
self.init_trajecotry(obs)
|
||||
|
||||
# Load the instruction
|
||||
# instructions = [ob['instruction'] for ob in obs]
|
||||
instructions = [ob['instruction'] for ob in obs]
|
||||
targets = [ob['target'] for ob in obs]
|
||||
|
||||
|
||||
@ -741,8 +868,8 @@ class NavGPTAgent(BaseAgent):
|
||||
print(self.config.load_action_plan)
|
||||
|
||||
if self.config.load_instruction:
|
||||
# action_plans = instructions
|
||||
action_plans = targets
|
||||
action_plans = instructions
|
||||
# action_plans = targets
|
||||
elif self.config.load_action_plan:
|
||||
action_plans = [ob['action_plan'] for ob in obs]
|
||||
else:
|
||||
@ -774,11 +901,13 @@ class NavGPTAgent(BaseAgent):
|
||||
# we are HERE
|
||||
feature = init_ob['obs']
|
||||
navigable = init_ob['candidate']
|
||||
# distances =
|
||||
objects = init_ob['objects']
|
||||
heading = np.rad2deg(init_ob['heading'])
|
||||
elevation = np.rad2deg(init_ob['elevation'])
|
||||
orientation = f'\nheading: {heading:.2f}, elevation: {elevation:.2f}'
|
||||
|
||||
|
||||
print("use_relative_angle:", self.config.use_relative_angle)
|
||||
print("use_relative_angle:", self.config.use_navigable)
|
||||
if self.config.use_relative_angle: # True
|
||||
@ -803,9 +932,19 @@ class NavGPTAgent(BaseAgent):
|
||||
'init_observation': init_observation, # 8 direction's observation caption & navigable point & objects
|
||||
}
|
||||
output = self.agent_executor(input)
|
||||
if LAST_VP != "":
|
||||
turned_angle, new_obs = self.make_equiv_action([LAST_VP])
|
||||
|
||||
|
||||
if 'stop' in FINAL_STATE:
|
||||
self.traj[i]['final_state'] = 'stop'
|
||||
else:
|
||||
self.traj[i]['final_state'] = 'not found'
|
||||
|
||||
self.traj[i]['llm_output'] = output['output']
|
||||
self.traj[i]['action_plan'] = output['action_plan']
|
||||
self.traj[i]['bbox'] = FOUND_BBOX
|
||||
|
||||
# extract agent's thought from llm output
|
||||
intermediate_steps = output['intermediate_steps']
|
||||
self.traj[i]['llm_thought'] = []
|
||||
@ -815,5 +954,7 @@ class NavGPTAgent(BaseAgent):
|
||||
self.traj[i]['llm_thought'].append(thought)
|
||||
self.traj[i]['llm_observation'].append(observation)
|
||||
|
||||
print("TRAJ: {}".format(self.traj[0]['path']))
|
||||
print(f"status={FINAL_STATE}, FOUND_BBOX={FOUND_BBOX}")
|
||||
print()
|
||||
return self.traj
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ class BaseAgent(object):
|
||||
output[-1]['llm_output'] = v['llm_output']
|
||||
output[-1]['llm_thought'] = v['llm_thought']
|
||||
output[-1]['llm_observation'] = v['llm_observation']
|
||||
output[-1]['final_state'] = v['final_state']
|
||||
return output
|
||||
|
||||
def rollout(self, **args):
|
||||
@ -50,6 +51,8 @@ class BaseAgent(object):
|
||||
else: # Do a full round
|
||||
while True:
|
||||
for traj in self.rollout(**kwargs):
|
||||
print(f"ID: {traj['instr_id']}")
|
||||
print(self.results.keys())
|
||||
if traj['instr_id'] in self.results:
|
||||
looped = True
|
||||
else:
|
||||
|
||||
@ -45,3 +45,8 @@ def construct_reverie_instrs(anno_dir, dataset, splits):
|
||||
del new_item['instr_encodings']
|
||||
data.append(new_item)
|
||||
return data
|
||||
|
||||
def load_json(f):
    """Read the JSON document stored at path ``f`` and return the parsed data.

    Args:
        f: Path of the JSON file to open.

    Returns:
        The Python object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding, so
    # non-ASCII annotations load the same way on every machine.
    with open(f, encoding="utf-8") as fp:
        return json.load(fp)
|
||||
|
||||
@ -14,6 +14,17 @@ from utils.graph_utils import NavGraph
|
||||
|
||||
ERROR_MARGIN = 3.0
|
||||
|
||||
# Maps "<scan>_<objid>" to the list of viewpoint ids whose bounding-box entry
# for that object has a truthy 'visible_pos' value.
obj2vps = {}
# Open through a context manager so the annotation file handle is closed as
# soon as parsing finishes, rather than being left to the garbage collector.
with open('/data/Matterport3DSimulator-duet/VLN-DUET/datasets/REVERIE/annotations/BBoxes.json') as fp:
    bbox_data = json.load(fp)
for scanvp, value in bbox_data.items():
    # Each top-level key is split on '_' into a scan id and a viewpoint id.
    scan, vp = scanvp.split('_')
    # Walk every object listed for this viewpoint and keep the visible ones.
    for objid, objinfo in value.items():
        if objinfo['visible_pos']:
            # setdefault creates the list on first sight of this object key.
            obj2vps.setdefault(scan + '_' + objid, []).append(vp)
|
||||
|
||||
def load_floorplan():
|
||||
region_label_lookup = load_region_label_lookup()
|
||||
|
||||
@ -118,6 +129,8 @@ def load_region_label_lookup():
|
||||
}
|
||||
return region_label_lookup
|
||||
|
||||
with open('./node_region.json') as fp:
|
||||
node_region = json.load(fp)
|
||||
|
||||
class Simulator(object):
|
||||
''' A simple simulator in Matterport3D environment '''
|
||||
@ -143,33 +156,35 @@ class Simulator(object):
|
||||
viewpoint_ID: str,
|
||||
heading: int,
|
||||
elevation: int,
|
||||
stop: str,
|
||||
start: str,
|
||||
target: str
|
||||
target: str,
|
||||
clip_target: str,
|
||||
):
|
||||
self.heading = heading
|
||||
self.elevation = elevation
|
||||
self.scan_ID = scan_ID
|
||||
self.viewpoint_ID = viewpoint_ID
|
||||
self.stop = stop
|
||||
self.start = start
|
||||
self.target = target
|
||||
self.clip_target = clip_target
|
||||
# Load navigable dict
|
||||
navigable_path = os.path.join(self.navigable_dir, self.scan_ID + '_navigable.json')
|
||||
with open(navigable_path, 'r') as f:
|
||||
navigable_dict = json.load(f)
|
||||
self.navigable_dict = json.load(f)
|
||||
|
||||
'''
|
||||
self.navigable_dict = {}
|
||||
for start, v in navigable_dict.items():
|
||||
self.navigable_dict[start] = {}
|
||||
print("BEFORE: ", len(navigable_dict[start]))
|
||||
# print("BEFORE: ", len(navigable_dict[start]))
|
||||
for to, _v in navigable_dict[start].items():
|
||||
start_region = self.node_region[scan_ID][start]
|
||||
to_region = self.node_region[scan_ID][to]
|
||||
if start_region == to_region:
|
||||
self.navigable_dict[start][to] = _v
|
||||
print(start_region, to_region)
|
||||
print("AFTER: ", len(self.navigable_dict[start]))
|
||||
# print(start_region, to_region)
|
||||
# print("AFTER: ", len(self.navigable_dict[start]))
|
||||
'''
|
||||
|
||||
# Get candidate
|
||||
self.getCandidate()
|
||||
@ -186,9 +201,9 @@ class Simulator(object):
|
||||
'heading': self.heading,
|
||||
'elevation': self.elevation,
|
||||
'candidate': self.candidate,
|
||||
'stop': self.stop,
|
||||
'start': self.start,
|
||||
'target': self.target
|
||||
'target': self.target,
|
||||
'clip_target': self.clip_target,
|
||||
}
|
||||
return self.state
|
||||
|
||||
@ -233,9 +248,9 @@ class EnvBatch(object):
|
||||
def _make_id(self, scanId, viewpointId):
|
||||
return scanId + '_' + viewpointId
|
||||
|
||||
def newEpisodes(self, scanIds, viewpointIds, headings, stops, starts, targets):
|
||||
for i, (scanId, viewpointId, heading, stop, start, target) in enumerate(zip(scanIds, viewpointIds, headings, stops, starts, targets)):
|
||||
self.sims[i].newEpisode(scanId, viewpointId, heading, 0, stop, start, target)
|
||||
def newEpisodes(self, scanIds, viewpointIds, headings, starts, targets, clip_targets):
|
||||
for i, (scanId, viewpointId, heading, start, target, clip_target) in enumerate(zip(scanIds, viewpointIds, headings, starts, targets, clip_targets)):
|
||||
self.sims[i].newEpisode(scanId, viewpointId, heading, 0, start, target, clip_target)
|
||||
|
||||
def getStates(self):
|
||||
"""
|
||||
@ -263,7 +278,7 @@ class REVERIENavBatch(object):
|
||||
|
||||
def __init__(
|
||||
self, view_db, instr_data, connectivity_dir, navigable_dir,
|
||||
batch_size=1, seed=0, name=None, data_limit=100
|
||||
batch_size=1, seed=0, name=None
|
||||
):
|
||||
self.env = EnvBatch(navigable_dir, feat_db=view_db, batch_size=batch_size)
|
||||
self.data = instr_data
|
||||
@ -272,14 +287,15 @@ class REVERIENavBatch(object):
|
||||
self.batch_size = batch_size
|
||||
self.name = name
|
||||
|
||||
#self.gt_trajs = self._get_gt_trajs(self.data) # for evaluation
|
||||
self.gt_trajs = self._get_gt_trajs(self.data) # for evaluation
|
||||
|
||||
# use different seeds in different processes to shuffle data
|
||||
'''
|
||||
self.seed = seed
|
||||
random.seed(self.seed)
|
||||
random.shuffle(self.data)
|
||||
'''
|
||||
|
||||
self.data = self.data[:data_limit]
|
||||
|
||||
self.ix = 0
|
||||
self._load_nav_graphs()
|
||||
@ -288,14 +304,12 @@ class REVERIENavBatch(object):
|
||||
print('%s loaded with %d instructions, using splits: %s' % (
|
||||
self.__class__.__name__, len(self.data), self.name))
|
||||
|
||||
'''
|
||||
def _get_gt_trajs(self, data):
|
||||
gt_trajs = {
|
||||
x['instr_id']: (x['scan'], x['path']) \
|
||||
x['new_reverie_id']: (x['scan'], x['path']) \
|
||||
for x in data if len(x['path']) > 1
|
||||
}
|
||||
return gt_trajs
|
||||
'''
|
||||
|
||||
def size(self):
|
||||
return len(self.data)
|
||||
@ -350,7 +364,7 @@ class REVERIENavBatch(object):
|
||||
'obs' : feature["detail"],
|
||||
'obs_summary' : feature["summary"],
|
||||
'objects' : feature["objects"],
|
||||
'instr_id' : item['instr_id'],
|
||||
# 'instr_id' : item['instr_id'],
|
||||
# 'action_plan' : item['action_plan'],
|
||||
'scan' : state['scanID'],
|
||||
'viewpoint' : state['viewpointID'],
|
||||
@ -360,9 +374,10 @@ class REVERIENavBatch(object):
|
||||
'instruction' : item['instruction'],
|
||||
'gt_path' : item['path'],
|
||||
'path_id' : item['path_id'],
|
||||
'stop': item['stop'],
|
||||
'start': item['start'],
|
||||
'target': item['target']
|
||||
'new_reverie_id': item['new_reverie_id'],
|
||||
'target': item['target'],
|
||||
'clip_target': item['clip_target']
|
||||
}
|
||||
# RL reward. The negative distance between the state and the final state
|
||||
# There are multiple gt end viewpoints on REVERIE.
|
||||
@ -384,10 +399,10 @@ class REVERIENavBatch(object):
|
||||
scanIds = [item['scan'] for item in self.batch]
|
||||
viewpointIds = [item['path'][0] for item in self.batch]
|
||||
headings = [item['heading'] for item in self.batch]
|
||||
stops = [item['stop'] for item in self.batch]
|
||||
starts = [item['start'] for item in self.batch]
|
||||
targets = [item['target'] for item in self.batch]
|
||||
self.env.newEpisodes(scanIds, starts, headings, stops, starts, targets)
|
||||
clip_targets = [item['clip_target'] for item in self.batch]
|
||||
self.env.newEpisodes(scanIds, starts, headings, starts, targets, clip_targets)
|
||||
return self._get_obs()
|
||||
|
||||
def step(self, next_viewpoint_IDs):
|
||||
@ -406,13 +421,13 @@ class REVERIENavBatch(object):
|
||||
near_d = d
|
||||
return near_id
|
||||
|
||||
def _eval_item(self, scan, pred_path, gt_path):
|
||||
def _eval_item(self, scan, pred_path, gt_path, gt_found, found, gt_objid):
|
||||
scores = {}
|
||||
|
||||
shortest_distances = self.shortest_distances[scan]
|
||||
|
||||
path = sum(pred_path, [])
|
||||
assert gt_path[0] == path[0], 'Result trajectories should include the start position'
|
||||
# assert gt_path[0] == path[0], 'Result trajectories should include the start position'
|
||||
|
||||
nearest_position = self._get_nearest(shortest_distances, gt_path[-1], path)
|
||||
|
||||
@ -424,10 +439,28 @@ class REVERIENavBatch(object):
|
||||
scores['trajectory_lengths'] = np.sum([shortest_distances[a][b] for a, b in zip(path[:-1], path[1:])])
|
||||
|
||||
gt_lengths = np.sum([shortest_distances[a][b] for a, b in zip(gt_path[:-1], gt_path[1:])])
|
||||
|
||||
scores['found_success'] = float(gt_found == found)
|
||||
|
||||
goal_viewpoints = set(obj2vps['%s_%s'%(scan, str(gt_objid))])
|
||||
|
||||
pred_stop_region = node_region[scan][path[-1]]
|
||||
gt_stop_region = node_region[scan][gt_path[-1]]
|
||||
|
||||
# scores['success'] = float(scores['nav_error'] < ERROR_MARGIN)
|
||||
scores['success'] = float(path[-1] in goal_viewpoints)
|
||||
scores['room_success'] = float(gt_stop_region == pred_stop_region)
|
||||
# scores['oracle_success'] = float(scores['oracle_error'] < ERROR_MARGIN)
|
||||
scores['oracle_success'] = float(any(x in goal_viewpoints for x in path))
|
||||
|
||||
scores['success'] = float(scores['nav_error'] < ERROR_MARGIN)
|
||||
scores['spl'] = scores['success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01)
|
||||
scores['oracle_success'] = float(scores['oracle_error'] < ERROR_MARGIN)
|
||||
scores['sspl_1'] = scores['success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
|
||||
scores['sspl_2'] = scores['room_success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
|
||||
scores['sspl_3'] = scores['oracle_success'] * gt_lengths / max(scores['trajectory_lengths'], gt_lengths, 0.01) * scores['found_success']
|
||||
|
||||
scores['ss_1'] = scores['success'] * scores['found_success']
|
||||
scores['ss_2'] = scores['room_success'] * scores['found_success']
|
||||
scores['ss_3'] = scores['oracle_success'] * scores['found_success']
|
||||
|
||||
scores.update(
|
||||
cal_dtw(shortest_distances, path, gt_path, scores['success'], ERROR_MARGIN)
|
||||
@ -445,8 +478,9 @@ class REVERIENavBatch(object):
|
||||
for item in preds:
|
||||
instr_id = item['instr_id']
|
||||
traj = item['trajectory']
|
||||
obj_id = instr_id.split('_')[1]
|
||||
scan, gt_traj = self.gt_trajs[instr_id]
|
||||
traj_scores = self._eval_item(scan, traj, gt_traj)
|
||||
traj_scores = self._eval_item(scan, traj, gt_traj, item['gt_found'], item['found'], obj_id)
|
||||
for k, v in traj_scores.items():
|
||||
metrics[k].append(v)
|
||||
metrics['instr_id'].append(instr_id)
|
||||
@ -458,8 +492,16 @@ class REVERIENavBatch(object):
|
||||
'nav_error': np.mean(metrics['nav_error']),
|
||||
'oracle_error': np.mean(metrics['oracle_error']),
|
||||
'sr': np.mean(metrics['success']) * 100,
|
||||
'room_success': np.mean(metrics['room_success']) * 100,
|
||||
'found_success': np.mean(metrics['found_success']) * 100,
|
||||
'oracle_sr': np.mean(metrics['oracle_success']) * 100,
|
||||
'spl': np.mean(metrics['spl']) * 100,
|
||||
'sspl_1': np.mean(metrics['sspl_1']) * 100,
|
||||
'sspl_2': np.mean(metrics['sspl_2']) * 100,
|
||||
'sspl_3': np.mean(metrics['sspl_3']) * 100,
|
||||
'ss_1': np.mean(metrics['ss_1']) * 100,
|
||||
'ss_2': np.mean(metrics['ss_2']) * 100,
|
||||
'ss_3': np.mean(metrics['ss_3']) * 100,
|
||||
'nDTW': np.mean(metrics['nDTW']) * 100,
|
||||
'SDTW': np.mean(metrics['SDTW']) * 100,
|
||||
'CLS': np.mean(metrics['CLS']) * 100,
|
||||
|
||||
@ -7,8 +7,8 @@ def parse_args():
|
||||
|
||||
# datasets
|
||||
parser.add_argument('--root_dir', type=str, default='../datasets')
|
||||
parser.add_argument('--dataset', type=str, default='r2r', choices=['r2r', 'r4r'])
|
||||
parser.add_argument('--output_dir', type=str, default='../datasets/R2R/exprs/gpt-3.5-turbo', help='experiment id')
|
||||
parser.add_argument('--dataset', type=str, default='reverie', choices=['r2r', 'r4r', 'reverie'])
|
||||
parser.add_argument('--output_dir', type=str, default='../datasets/REVERIE/exprs/gpt-3.5-turbo', help='experiment id')
|
||||
# parser.add_argument('--output_dir', type=str, default='../datasets/R2R/exprs/LlaMA-2-13b-test', help='experiment id')
|
||||
parser.add_argument('--seed', type=int, default=0)
|
||||
|
||||
@ -21,7 +21,7 @@ def parse_args():
|
||||
parser.add_argument('--max_iterations', type=int, default=25)
|
||||
|
||||
# General config
|
||||
parser.add_argument('--iters', type=int, default=10, help='number of iterations to run')
|
||||
parser.add_argument('--iters', type=int, default=None, help='number of iterations to run')
|
||||
# parser.add_argument('--iters', type=int, default=None, help='number of iterations to run')
|
||||
parser.add_argument('--max_scratchpad_length', type=int, default=1000, help='max number of steps in an episode')
|
||||
parser.add_argument('--test', action='store_true', default=False)
|
||||
|
||||
@ -244,15 +244,24 @@ Instruction: {action_plan}
|
||||
Initial Observation: {init_observation}
|
||||
Thought: I should start navigation according to the instruction, {agent_scratchpad}"""
|
||||
|
||||
VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate an indoor environment to reach a target viewpoint based on a given instruction, performing the Vision and Language Navigation (VLN) task. You'll move among static positions within a pre-defined graph, aiming for minimal steps.
|
||||
VLN_GPT35_PROMPT = """As an intelligent embodied agent, you will navigate in an indoor environment to reach a target viewpoint based on a given instruction, performing the Vision and Language Navigation (VLN) task.
|
||||
|
||||
The instruction will let you find all the target objects in a building.
|
||||
|
||||
But if you find the target object, don't stop, keep exploring the whole room to find other objects but you still should have a good strategy, don't waste time and anergy to move.
|
||||
|
||||
You will move among static positions within a pre-defined graph, aiming for the nearest position to the object if the object is present.
|
||||
|
||||
You will receive a trajectory instruction at the start and will have access to step history (your Thought, Action, Action Input and Obeservation after the Begin! sign) and current viewpoint observation (including scene descriptions, objects, and navigable directions/distances within 3 meters) during navigation. Orientations range from -180 to 180 degrees, with 0 being forward, right 90 rightward, right/left 180 backward, and left 90 leftward.
|
||||
|
||||
Explore the environment and don't stay at the original point. Keep Walking! Reach within 3 meters of the instructed destination, and if it's visible but no objects are detected, move closer.
|
||||
- Notice: You should have a good strategy to check whether the target object exists in the target room, and stop when you exploring all viewpoint in the target room.
|
||||
|
||||
If you find the object but I haven't said you can stop. You cannot say you have finished the task! Keep exploring the nearby area.
|
||||
Explore the environment while avoiding revisiting viewpoints by comparing current and previously visited IDs
|
||||
|
||||
continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
|
||||
If you think you are moving in circles, please stop and think whether any other objects may be hiden. If no, please output 'Final Answer: Not found'.
|
||||
|
||||
Continue by considering your location and the next viewpoint based on the instruction, using the action_maker tool.
|
||||
And if you explored all the target room(no other viewpoint to move to), stop and output 'Final Answer: Not found!'.
|
||||
Show your reasoning in the Thought section.
|
||||
|
||||
Follow the given format and use provided tools.
|
||||
@ -260,18 +269,27 @@ Follow the given format and use provided tools.
|
||||
Do not fabricate nonexistent viewpoint IDs.
|
||||
|
||||
----
|
||||
Starting below, you should follow this format:
|
||||
Starting below, you should follow this format, do not use other format:
|
||||
|
||||
Instruction: the instruction describing the whole trajectory
|
||||
Initial Observation: the initial observation of the environment
|
||||
Thought: you should always think about what to do next and why
|
||||
Action: the action to take, must be one of the tools [{tool_names}]
|
||||
Action Input: "Viewpoint ID" but do not stay in the original viewpoint
|
||||
Action Input: "Viewpoint ID", you should not choose object name or others, please only output "Viewpoint ID"
|
||||
Observation: the result of the action
|
||||
... (this Thought/Action/Action Input/Observation can repeat N times)
|
||||
Thought: I found my target object, but I should check whether any other objects may be hidden.
|
||||
|
||||
or
|
||||
|
||||
Thought: I checked that no objects are hidden, I can stop.
|
||||
Final Answer: Not found!
|
||||
|
||||
----
|
||||
|
||||
Begin!
|
||||
|
||||
Instruction: {action_plan}
|
||||
Initial Observation: {init_observation}
|
||||
Thought: I should start navigation according to the instruction, {agent_scratchpad}"""
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user