fix: original JSON contains non-ASCII characters
This commit is contained in:
parent
595866c2f4
commit
4073c52bb8
@ -1,5 +1,10 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
def remove_non_ascii(text: str) -> str:
    """Return *text* with every non-ASCII character replaced by a space.

    Each character outside the range U+0000-U+007F is swapped for exactly
    one space, so the result has the same number of characters as the
    input. ASCII characters, including control characters, are left as-is.
    """
    # One-for-one replacement (no `+` quantifier): a run of several
    # non-ASCII characters becomes the same number of spaces.
    return re.sub(r'[^\x00-\x7F]', ' ', text)
|
||||
|
||||
|
||||
for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train_seen']:
|
||||
print(file)
|
||||
@ -18,13 +23,13 @@ for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train
|
||||
'path_id': int(i['path_id']),
|
||||
'path': i['path'],
|
||||
'heading': float(i['heading']),
|
||||
'instructions': [ i['instruction'] ],
|
||||
'instructions': [ remove_non_ascii(i['instruction'])],
|
||||
'found': [ i['found'] ],
|
||||
'id': i['id'],
|
||||
'objId': i['objId']
|
||||
}
|
||||
else:
|
||||
result[instruction_id]['instructions'].append(i['instruction'])
|
||||
result[instruction_id]['instructions'].append(remove_non_ascii(i['instruction']))
|
||||
result[instruction_id]['found'].append( i['found'] )
|
||||
|
||||
output = []
|
||||
|
||||
Loading…
Reference in New Issue
Block a user