import re

# Matches any single character outside the 7-bit ASCII range
# (code points above U+007F). Compiled once at import time so that
# repeated calls reuse the same pattern object.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]')


def remove_non_ascii(text: str) -> str:
    """Return *text* with every non-ASCII character replaced by a space.

    Each character whose code point is above U+007F is swapped for exactly
    one space, so the result has the same number of characters as the
    input. ASCII characters, including control characters, pass through
    unchanged. Runs of adjacent non-ASCII characters are not collapsed:
    they become the same number of consecutive spaces.

    Args:
        text: The string to sanitize.

    Returns:
        A new string containing only ASCII characters.

    Raises:
        TypeError: If *text* is not a string (raised by the regex engine).
    """
    return _NON_ASCII_RE.sub(' ', text)