From 4073c52bb813c2f71c6d11c8481a131ba9f3c1e4 Mon Sep 17 00:00:00 2001
From: Ting-Jun Wang <levi900227@gmail.com>
Date: Wed, 8 Nov 2023 23:17:13 +0800
Subject: [PATCH] fix: replace non-ASCII characters in instructions from the original json

---
 adversarial_summary.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/adversarial_summary.py b/adversarial_summary.py
index eb63d89..de45c75 100644
--- a/adversarial_summary.py
+++ b/adversarial_summary.py
@@ -1,5 +1,10 @@
 import json
 import os
+import re
+
+def remove_non_ascii(text):  # Return `text` with every character outside \x00-\x7F replaced by a space.
+    return re.sub(r'[^\x00-\x7F]', ' ', text)  # one space per matched character; ASCII characters pass through unchanged
+
 
 for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train_seen']:
     print(file)
@@ -18,13 +23,13 @@ for file in ['train', 'val_unseen', 'val_seen', 'train_seen', 'test', 'val_train
                     'path_id': int(i['path_id']),
                     'path': i['path'],
                     'heading': float(i['heading']),
-                    'instructions': [ i['instruction'] ],
+                    'instructions': [ remove_non_ascii(i['instruction'])], 
                     'found': [ i['found'] ],
                     'id': i['id'],
                     'objId': i['objId']
                 }
             else:
-                result[instruction_id]['instructions'].append(i['instruction'])
+                result[instruction_id]['instructions'].append(remove_non_ascii(i['instruction']))
                 result[instruction_id]['found'].append( i['found'] )
 
         output = []