diff --git a/r2r_src/agent.py b/r2r_src/agent.py
index d8f8ef4..057aa57 100644
--- a/r2r_src/agent.py
+++ b/r2r_src/agent.py
@@ -35,12 +35,12 @@ class BaseAgent(object):
         self.losses = [] # For learning agents
 
     def write_results(self):
-        output = [{'instr_id':k, 'trajectory': v} for k,v in self.results.items()]
+        output = [{'instr_id':k, 'trajectory': v[0], 'found': v[1]} for k,v in self.results.items()]
         with open(self.results_path, 'w') as f:
             json.dump(output, f)
 
     def get_results(self):
-        output = [{'instr_id': k, 'trajectory': v} for k, v in self.results.items()]
+        output = [{'instr_id': k, 'trajectory': v[0], 'found': v[1]} for k, v in self.results.items()]
         return output
 
     def rollout(self, **args):
@@ -61,17 +61,21 @@ class BaseAgent(object):
         if iters is not None:
             # For each time, it will run the first 'iters' iterations. (It was shuffled before)
             for i in range(iters):
-                for traj in self.rollout(**kwargs):
+                trajs, found = self.rollout(**kwargs)
+                print(found)
+                for index, traj in enumerate(trajs):
                     self.loss = 0
-                    self.results[traj['instr_id']] = traj['path']
+                    self.results[traj['instr_id']] = (traj['path'], found[index])
         else:   # Do a full round
             while True:
-                for traj in self.rollout(**kwargs):
+                trajs, found = self.rollout(**kwargs)
+                print("FOUND: ", found)
+                for index, traj in enumerate(trajs):
                     if traj['instr_id'] in self.results:
                         looped = True
                     else:
                         self.loss = 0
-                        self.results[traj['instr_id']] = traj['path']
+                        self.results[traj['instr_id']] = (traj['path'], found[index])
                 if looped:
                     break
@@ -155,7 +159,9 @@ class Seq2SeqAgent(BaseAgent):
         for i, ob in enumerate(obs):
             for j, cc in enumerate(ob['candidate']):
                 candidate_feat[i, j, :] = cc['feature']
-            candidate_feat[i, -1, :] = np.ones((self.feature_size + args.angle_feat_size))
+
+            # add the NOT FOUND token
+            candidate_feat[i, len(ob['candidate'])+1, :] = np.ones((self.feature_size + args.angle_feat_size))
 
         return torch.from_numpy(candidate_feat).cuda(), candidate_leng
@@ -188,12 +194,13 @@ class Seq2SeqAgent(BaseAgent):
                 else:   # Stop here
                     assert ob['teacher'] == ob['viewpoint']         # The teacher action should be "STAY HERE"
                     if ob['swap']:      # the instruction was swapped, so the target is NOT FOUND
-                        a[i] = len(ob['candidate'])
-                    else:   # STOP
                         a[i] = len(ob['candidate'])-1
+                    else:   # STOP
+                        a[i] = len(ob['candidate'])-2
+        print(" ", a)
         return torch.from_numpy(a).cuda()
 
-    def make_equiv_action(self, a_t, perm_obs, perm_idx=None, traj=None):
+    def make_equiv_action(self, a_t, perm_obs, perm_idx=None, traj=None, found=None):
         """
         Interface between Panoramic view and Egocentric view
         It will convert the action panoramic view action a_t to equivalent egocentric view actions for the simulator
         """
@@ -209,7 +216,7 @@ class Seq2SeqAgent(BaseAgent):
 
         for i, idx in enumerate(perm_idx):
             action = a_t[i]
-            print('action: ', action)
+            # print('action: ', action)
             if action != -1 and action != -2:            # -1 is the <stop> action
                 select_candidate = perm_obs[i]['candidate'][action]
                 src_point = perm_obs[i]['viewIndex']
@@ -233,11 +240,17 @@ class Seq2SeqAgent(BaseAgent):
                 # print("action: {}  view_index: {}".format(action, state.viewIndex))
                 if traj is not None:
                     traj[i]['path'].append((state.location.viewpointId, state.heading, state.elevation))
+            else:
+                found[i] = action
+
+
+
+            '''
             elif action == -1:
                 print('')
             elif action == -2:
                 print('')
-
+            '''
 
     def rollout(self, train_ml=None, train_rl=True, reset=True):
         """
@@ -247,6 +260,7 @@ class Seq2SeqAgent(BaseAgent):
 
         :return:
         """
+        print("ROLLOUT!!!")
         if self.feedback == 'teacher' or self.feedback == 'argmax':
             train_rl = False
 
@@ -283,6 +297,9 @@ class Seq2SeqAgent(BaseAgent):
             'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])],
         } for ob in perm_obs]
 
+        found = [None for _ in range(len(perm_obs))]
+
+
         # Init the reward shaping
         last_dist = np.zeros(batch_size, np.float32)
         last_ndtw = np.zeros(batch_size, np.float32)
@@ -306,6 +323,7 @@ class Seq2SeqAgent(BaseAgent):
 
             input_a_t, candidate_feat, candidate_leng = self.get_input_feat(perm_obs)
+
 
             # the first [CLS] token, initialized by the language BERT, serves
             # as the agent's state passing through time steps
             if (t >= 1) or (args.vlnbert=='prevalent'):
@@ -337,9 +355,10 @@ class Seq2SeqAgent(BaseAgent):
                 # Supervised training
                 target = self._teacher_action(perm_obs, ended)
                 for i, d in enumerate(target):
-                    print(perm_obs[i]['swap'], perm_obs[i]['instructions'])
-                    print(d)
+                    # print(perm_obs[i]['swap'], perm_obs[i]['instructions'])
+                    # print(d)
                     _, at_t = logit.max(1)
+                    '''
                     if at_t[i].item() == candidate_leng[i]-1:
                         print("-2")
                     elif at_t[i].item() == candidate_leng[i]-2:
@@ -347,14 +366,19 @@ class Seq2SeqAgent(BaseAgent):
                     else:
                         print(at_t[i].item())
                     print()
+                    '''
                 ml_loss += self.criterion(logit, target)
 
+            a_predict = None
             # Determine next model inputs
             if self.feedback == 'teacher':
                 a_t = target                 # teacher forcing
+                _, a_predict = logit.max(1)
+                a_predict = a_predict.detach()
             elif self.feedback == 'argmax':
                 _, a_t = logit.max(1)        # student forcing - argmax
                 a_t = a_t.detach()
+                a_predict = a_t.detach()
                 log_probs = F.log_softmax(logit, 1)                              # Calculate the log_prob here
                 policy_log_probs.append(log_probs.gather(1, a_t.unsqueeze(1)))   # Gather the log_prob for each batch
             elif self.feedback == 'sample':
@@ -362,23 +386,39 @@ class Seq2SeqAgent(BaseAgent):
                 c = torch.distributions.Categorical(probs)
                 self.logs['entropy'].append(c.entropy().sum().item())      # For log
                 entropys.append(c.entropy())                                # For optimization
-                a_t = c.sample().detach()
+                new_c = c.sample()
+                a_t = new_c.detach()
+                a_predict = new_c.detach()
                 policy_log_probs.append(c.log_prob(a_t))
             else:
-                print(self.feedback)
+                # print(self.feedback)
                 sys.exit('Invalid feedback option')
 
             # Prepare environment action
             # NOTE: Env action is in the perm_obs space
             cpu_a_t = a_t.cpu().numpy()
             for i, next_id in enumerate(cpu_a_t):
-                if next_id == (candidate_leng[i]-2) or next_id == args.ignoreid or ended[i]:    # The last action is <end>
+                if next_id == args.ignoreid or ended[i]:
+                    if found[i] == True:
+                        cpu_a_t[i] = -1         # Change the <end> and ignore action to -1
+                    else:
+                        cpu_a_t[i] = -2
+                elif next_id == (candidate_leng[i]-2):
                     cpu_a_t[i] = -1             # Change the <end> and ignore action to -1
                 elif next_id == (candidate_leng[i]-1):
                     cpu_a_t[i] = -2
+
+            cpu_a_predict = a_predict.cpu().numpy()
+            for i, next_id in enumerate(cpu_a_predict):
+                if next_id == (candidate_leng[i]-2):
+                    cpu_a_predict[i] = -1       # Change the <end> and ignore action to -1
+                elif next_id == (candidate_leng[i]-1):
+                    cpu_a_predict[i] = -2
+
             # Make action and get the new state
-            self.make_equiv_action(cpu_a_t, perm_obs, perm_idx, traj)
+            print(cpu_a_t)
+            self.make_equiv_action(cpu_a_t, perm_obs, perm_idx, traj, found=found)
             obs = np.array(self.env._get_obs())
             perm_obs = obs[perm_idx]            # Perm the obs for the resu
@@ -445,7 +485,7 @@ class Seq2SeqAgent(BaseAgent):
             if ended.all():
                 break
 
-        print()
+        # print()
 
         if train_rl:
             # Last action in A2C
@@ -517,8 +557,9 @@ class Seq2SeqAgent(BaseAgent):
             self.losses.append(0.)
         else:
             self.losses.append(self.loss.item() / self.episode_len)        # This argument is useless.
+ print("\n\n") - return traj + return traj, found def test(self, use_dropout=False, feedback='argmax', allow_cheat=False, iters=None): ''' Evaluate once on each instruction in the current environment ''' diff --git a/r2r_src/train.py b/r2r_src/train.py index aa9b5ef..1b64f4e 100644 --- a/r2r_src/train.py +++ b/r2r_src/train.py @@ -199,7 +199,8 @@ def train_val(test_only=False): else: featurized_scans = set([key.split("_")[0] for key in list(feat_dict.keys())]) # val_env_names = ['val_train_seen', 'val_seen', 'val_unseen'] - val_env_names = ['val_train_seen'] + # val_env_names = ['val_train_seen'] + val_env_names = ['val_unseen'] train_env = R2RBatch(feat_dict, batch_size=args.batchSize, splits=['train'], tokenizer=tok) from collections import OrderedDict