feat: dataset
This commit is contained in:
parent
de44cad219
commit
bf905e9e03
37
train/dataset.py
Normal file
37
train/dataset.py
Normal file
@ -0,0 +1,37 @@
|
||||
import os
|
||||
import numpy as np
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
class Cifar10Dataset(Dataset):
    """CIFAR-10 training dataset loaded from the original pickled batch files.

    Reads every file whose name contains ``data_batch`` under ``data_dir``
    (the standard ``cifar-10-batches-py`` layout), normalizes pixel values to
    [0, 1], and exposes images as float arrays of shape (3, 32, 32).
    """

    def __init__(self, data_dir):
        """Load all training batches from *data_dir*.

        Args:
            data_dir: Directory containing the CIFAR-10 ``data_batch_*``
                pickle files.
        """
        self.imgs = []
        self.labels = []
        # sorted() makes sample order deterministic; os.listdir order is
        # filesystem-dependent.
        for file in sorted(os.listdir(data_dir)):
            if 'data_batch' not in file:
                continue
            batch = self.unpickle(f'{data_dir}/{file}')
            self.labels += batch[b'labels']

            # Each row is 3072 values laid out channel-major (the R, G, B
            # 32x32 planes back to back), so a single reshape is equivalent
            # to the per-pixel copy loop — but runs at C speed.
            values = np.array(batch[b'data']) / 255.0
            self.imgs.append(values.reshape(-1, 3, 32, 32))

        if self.imgs:
            self.imgs = np.concatenate(self.imgs)
        else:
            # No batch files found: empty dataset instead of a cryptic
            # np.concatenate error.
            self.imgs = np.zeros((0, 3, 32, 32))
        print(f"load images : {self.imgs.shape}")
        print(f"load labels : {len(self.labels)}")

    def unpickle(self, file):
        """Load one pickled CIFAR-10 batch.

        Returns a dict keyed by bytes (b'data', b'labels'), per the official
        CIFAR-10 distribution (pickled with bytes keys).
        """
        import pickle
        with open(file, 'rb') as fo:
            return pickle.load(fo, encoding='bytes')

    def __getitem__(self, index):
        """Return ``(image, label)``; image is a (3, 32, 32) float array in [0, 1]."""
        return self.imgs[index], self.labels[index]

    def __len__(self):
        """Number of images loaded."""
        return len(self.imgs)
|
||||
Loading…
Reference in New Issue
Block a user