From 4073c52bb813c2f71c6d11c8481a131ba9f3c1e4 Mon Sep 17 00:00:00 2001 From: Ting-Jun Wang Date: Wed, 8 Nov 2023 23:17:13 +0800 Subject: [PATCH] fix: original json contains unicode characters --- adversarial_summary.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/adversarial_summary.py b/adversarial_summary.py index eb63d89..de45c75 100644 --- a/adversarial_summary.py +++ b/adversarial_summary.py @@ -1,5 +1,10 @@ import json import os +import re + +def remove_non_ascii(text): + return re.sub(r'[^\x00-\x7F]', ' ', text) + for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train_seen']: print(file) @@ -18,13 +23,13 @@ for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train 'path_id': int(i['path_id']), 'path': i['path'], 'heading': float(i['heading']), - 'instructions': [ i['instruction'] ], + 'instructions': [ remove_non_ascii(i['instruction'])], 'found': [ i['found'] ], 'id': i['id'], 'objId': i['objId'] } else: - result[instruction_id]['instructions'].append(i['instruction']) + result[instruction_id]['instructions'].append(remove_non_ascii(i['instruction'])) result[instruction_id]['found'].append( i['found'] ) output = []