fix: replace non-ASCII characters in instructions from the original JSON

This commit is contained in:
Ting-Jun Wang 2023-11-08 23:17:13 +08:00
parent 595866c2f4
commit 4073c52bb8
Signed by: snsd0805
GPG Key ID: 48D331A3D6160354

View File

@ -1,5 +1,10 @@
import json
import os
import re
# Compiled once at import time; matches any single character outside the
# 7-bit ASCII range (U+0000-U+007F).
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]')


def remove_non_ascii(text: str) -> str:
    """Return *text* with every non-ASCII character replaced by a space.

    Each character outside U+0000-U+007F is replaced by exactly one space,
    so the number of characters in the string is preserved. ASCII
    characters, including control characters and newlines, are left
    untouched.

    Args:
        text: The string to sanitise.

    Returns:
        A copy of *text* containing only ASCII characters.

    Raises:
        TypeError: If *text* is not a ``str`` (raised by ``re``).
    """
    return _NON_ASCII_RE.sub(' ', text)
for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train_seen']:
print(file)
@ -18,13 +23,13 @@ for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train
'path_id': int(i['path_id']),
'path': i['path'],
'heading': float(i['heading']),
'instructions': [ i['instruction'] ],
'instructions': [ remove_non_ascii(i['instruction'])],
'found': [ i['found'] ],
'id': i['id'],
'objId': i['objId']
}
else:
result[instruction_id]['instructions'].append(i['instruction'])
result[instruction_id]['instructions'].append(remove_non_ascii(i['instruction']))
result[instruction_id]['found'].append( i['found'] )
output = []