fix: original json contains unicode characters
This commit is contained in:
parent
595866c2f4
commit
4073c52bb8
@ -1,5 +1,10 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
def remove_non_ascii(text):
    """Return *text* with every non-ASCII character replaced by a space.

    Each character outside the range U+0000-U+007F is substituted with a
    single space, so the result has the same length as the input.

    Args:
        text: The string to sanitize.

    Returns:
        A string containing only ASCII characters.
    """
    # One-for-one replacement (not deletion) keeps words on either side of
    # a stripped character from being fused together.
    return re.sub(r'[^\x00-\x7F]', ' ', text)
|
||||||
|
|
||||||
|
|
||||||
for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train_seen']:
|
for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train_seen']:
|
||||||
print(file)
|
print(file)
|
||||||
@ -18,13 +23,13 @@ for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train
|
|||||||
'path_id': int(i['path_id']),
|
'path_id': int(i['path_id']),
|
||||||
'path': i['path'],
|
'path': i['path'],
|
||||||
'heading': float(i['heading']),
|
'heading': float(i['heading']),
|
||||||
'instructions': [ i['instruction'] ],
|
'instructions': [ remove_non_ascii(i['instruction'])],
|
||||||
'found': [ i['found'] ],
|
'found': [ i['found'] ],
|
||||||
'id': i['id'],
|
'id': i['id'],
|
||||||
'objId': i['objId']
|
'objId': i['objId']
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
result[instruction_id]['instructions'].append(i['instruction'])
|
result[instruction_id]['instructions'].append(remove_non_ascii(i['instruction']))
|
||||||
result[instruction_id]['found'].append( i['found'] )
|
result[instruction_id]['found'].append( i['found'] )
|
||||||
|
|
||||||
output = []
|
output = []
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user