该数据集共有60000张彩色图像,这些图像是32*32,分为10个类,每类6000张图。这里面有50000张用于训练,构成了5个训练批,每一批10000张图;另外10000用于测试,单独构成一批。测试批的数据里,取自10类中的每一类,每一类随机取1000张。抽剩下的就随机排列组成了训练批。注意一个训练批中的各类图像并不一定数量相同,总的来看训练批,每一类都有5000张图。
我们可以看到下载后的数据集文件夹包括:
其中data_batch就是用于训练的数据集,一共有五个;test_batch就是用于测试的数据集,一共有一个;batches.meta中包含的信息是类别名称。下面我们就仔细看看各个文件包含的具体内容:
import pickle


def unpickle(file):
    """Open the pickle file at ``file`` and return the object stored in it."""
    with open(file, 'rb') as handle:
        # encoding='latin-1' tells pickle how to decode 8-bit strings
        # written by Python 2.
        return pickle.load(handle, encoding='latin-1')


# Adjust this path to wherever the dataset was extracted.
cc = unpickle("./dataset/cifar-10/cifar-10-batches-py/batches.meta")
print(cc)
运行结果:
可以看到:num_cases_per_batch应该是一个batch包含的图片数量,label_names应该是类别名字,num_vis应该是3072,即32*32(图片大小)*3(RGB三通道),个人理解为描述一幅图像所需要的值的个数。
debug结果:代码不加赘述,上面的改下路径就行
可以看到,其中包含batch_label、labels、data、filenames四项:batch_label就是第几个batch,labels就是每张图属于第几类,data就是上面3072对应的具体值(不止一组3072个值,因为不止一幅图,大家自行理解哈),其实就是RGB值,filenames就是图片的名字,是不是挺简单的。test_batch也是一样的道理。
接下来就是具体自己的数据集转换的代码:
# -*- coding: utf-8 -*-
"""
Convert a folder of labelled images into one CIFAR-10 style pickle batch.

Every image in ``folder`` is resized to 32x32 RGB and written to
``folder_ad``; the resized images are then flattened and pickled to
``binpath`` together with their labels and file names.

@author: zhangjiaqing (partly adapted from other sources)
"""
import numpy as np
import chardet
from PIL import Image
import operator
import os
from os import listdir
import sys
import pickle
import random


def unpickle(file):
    """Load and return the object pickled in ``file``.

    NOTE: unpickling can execute arbitrary code; only open trusted files.
    """
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='latin-1')
    return batch


data = {}
list1 = []  # kept for backward compatibility; no longer used as scratch space
list2 = []  # one flattened pixel row per image
list3 = []  # file names, in the same order as list2


def img_tra():
    """Resize every image listed in ``imglist`` to 32x32 RGB.

    Reads from ``folder`` and writes the result under the same file name
    into ``folder_ad`` (created if missing).
    """
    os.makedirs(folder_ad, exist_ok=True)
    for name in imglist:
        with Image.open(os.path.join(folder, name)) as im:
            # Force three channels so palette/grayscale/RGBA inputs do not
            # break the later per-channel flattening.
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name.
            out = im.convert('RGB').resize((32, 32), Image.LANCZOS)
        out.save(os.path.join(folder_ad, str(name)))


def addWord(theIndex, word, adder):
    """Append ``adder`` to the list stored under ``word`` in ``theIndex``."""
    theIndex.setdefault(word, []).append(adder)


def seplabel(fname):
    """Return the integer class encoded before the first '_' of ``fname``.

    Raises ValueError if that prefix is not an integer.
    """
    filestr = fname.split(".")[0]
    label = int(filestr.split("_")[0])
    return label


def _to_cifar_row(im, side):
    """Flatten ``im`` into one CIFAR row of length 3 * side * side.

    The layout is all red values, then all green, then all blue, each plane
    in row-major order (first ``side`` entries are the first image row).
    Raises ValueError if the image is smaller than ``side`` x ``side``.
    """
    pixels = np.asarray(im.convert('RGB'), dtype=np.uint8)  # (H, W, 3)
    if pixels.shape[0] < side or pixels.shape[1] < side:
        raise ValueError(
            "image is %dx%d, expected at least %dx%d"
            % (pixels.shape[1], pixels.shape[0], side, side)
        )
    # (H, W, C) -> (C, H, W) -> flat.  The previous getpixel((i, j)) loops
    # walked x in the outer loop, which stored every image transposed.
    return pixels[:side, :side].transpose(2, 0, 1).reshape(-1)


def mkcf():
    """Pack the resized images from ``folder_ad`` into the batch at ``binpath``.

    Fills the module-level ``data`` dict with ``batch_label``, ``labels``,
    ``data`` (uint8 array, one row per image) and ``filenames``, then
    pickles it to ``binpath``.
    """
    for k, name in enumerate(imglist):
        with Image.open(os.path.join(folder_ad, name)) as im:
            list2.append(_to_cifar_row(im, 32))
        print("image"+str(k+1)+"saved.")
        list3.append(name)  # name of the picture

    arr2 = np.array(list2, dtype=np.uint8)
    # Other values used by CIFAR: 'training batch 1 of 5', 'testing batch 1 of 1'
    data['batch_label'] = 'training batch 5 of 5'
    data.setdefault('labels', label)
    data.setdefault('data', arr2)
    data.setdefault('filenames', list3)

    out_dir = os.path.dirname(binpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(binpath, 'wb') as output:
        pickle.dump(data, output)


folder = "./cloud/train_batch_5"        # directory with your own images (train_batch_5 / test)
folder_ad = "./cloud/train_batch_5_ad"  # directory for the 32x32 RGB copies (train_batch_5_ad / test_ad)
# NOTE: the original author apparently got this line wrong; corrected here.
imglist = listdir(folder)
num = len(imglist)
img_tra()
label = [seplabel(name) for name in imglist]
binpath = "./dataset/cloud/cloud-5-batches-py/data_batch_5"  # output path (data_batch_5 / test_batch)
print(binpath)
mkcf()
给大家看一下转的结果
解释一下为什么这里少了batches.meta:感觉这里的信息没什么用,如果是自己的数据集,可以直接写一句代码就行:
# Class names for a custom dataset; this list stands in for batches.meta.
classes = [
    "A",
    "B",
    "C",
    "D",
    "E",
]
注意自己数据集图片的命名:'_'前面是类别,并且必须是整数(例如 3_0001.jpg 的类别为3)。
# -*- coding: utf-8 -*-
"""
Cut images into size x size tiles and pack the tiles into a CIFAR-style batch.

Each image in ``folder`` is split into non-overlapping tiles that are saved
to ``folder_ad``; every tile in ``folder_ad`` is then flattened and pickled
to ``binpath`` together with its label and file name.

@author: zhangjiaqing
"""
import numpy as np
import chardet
from PIL import Image
import operator
import os
from os import listdir
import sys
import pickle
import random
from skimage.util.shape import view_as_windows


def unpickle(file):
    """Load and return the object pickled in ``file``.

    NOTE: unpickling can execute arbitrary code; only open trusted files.
    """
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='latin-1')
    return batch


data = {}
list1 = []  # kept for backward compatibility; no longer used as scratch space
list2 = []  # one flattened pixel row per tile
list3 = []  # tile file names, in the same order as list2
label = []  # labels in tile-creation order, filled by split()
size = 64   # tile edge length in pixels


def split():
    """Tile every image in ``folder`` and save the tiles into ``folder_ad``.

    Tiles are ``size`` x ``size`` and non-overlapping (step == size). Each
    tile gets a random label in 0..4, which is appended to the module-level
    ``label`` list and encoded as the last '_' field of the tile file name.
    """
    os.makedirs(folder_ad, exist_ok=True)
    for name in imglist:
        with Image.open(os.path.join(folder, name)) as img:
            # The (size, size, 3) window below needs a three-channel array.
            pixels = np.array(img.convert('RGB'))
        tiles = view_as_windows(pixels, (size, size, 3), step=size)
        hang, lie = tiles.shape[0], tiles.shape[1]  # tile rows, tile columns
        for i in range(hang):
            for j in range(lie):
                tile_label = random.randint(0, 4)
                # Accumulate across all images; resetting the list per image
                # used to drop the labels of every image but the last.
                label.append(tile_label)
                image = Image.fromarray(tiles[i, j, 0, :, :, :].astype('uint8'))
                image.save(os.path.join(
                    folder_ad,
                    '%s_%d_%d.jpg' % (str(name), i*lie+j, tile_label),
                ))


def addWord(theIndex, word, adder):
    """Append ``adder`` to the list stored under ``word`` in ``theIndex``."""
    theIndex.setdefault(word, []).append(adder)


def _label_from_name(fname):
    """Return the label that split() encoded as the last '_' field of ``fname``.

    Raises IndexError or ValueError if the name does not follow the
    '<source>_<index>_<label>.<ext>' pattern.
    """
    stem = fname.rsplit('.', 1)[0]
    return int(stem.rsplit('_', 1)[1])


def _to_cifar_row(im, side):
    """Flatten ``im`` into one CIFAR row of length 3 * side * side.

    The layout is all red values, then all green, then all blue, each plane
    in row-major order. Raises ValueError if the image is smaller than
    ``side`` x ``side``.
    """
    pixels = np.asarray(im.convert('RGB'), dtype=np.uint8)  # (H, W, 3)
    if pixels.shape[0] < side or pixels.shape[1] < side:
        raise ValueError(
            "image is %dx%d, expected at least %dx%d"
            % (pixels.shape[1], pixels.shape[0], side, side)
        )
    # (H, W, C) -> (C, H, W) -> flat.  The previous getpixel((i, j)) loops
    # walked x in the outer loop, which stored every tile transposed.
    return pixels[:side, :side].transpose(2, 0, 1).reshape(-1)


def mkcf():
    """Pack every tile listed in ``imagelist`` into the batch at ``binpath``.

    Fills the module-level ``data`` dict with ``batch_label``, ``labels``,
    ``data`` (uint8 array, one row per tile) and ``filenames``, then pickles
    it to ``binpath``.
    """
    for k, name in enumerate(imagelist):
        with Image.open(os.path.join(folder_ad, name)) as im:
            list2.append(_to_cifar_row(im, size))
        print("image"+str(k+1)+"saved.")
        list3.append(name)  # name of the picture

    arr2 = np.array(list2, dtype=np.uint8)
    if train:
        data['batch_label'] = 'training batch 1 of 1'
    else:
        data['batch_label'] = 'testing batch 1 of 1'

    # Read labels back from the file names so they line up with the
    # listdir() order of the rows, which differs from tile-creation order.
    labels = [_label_from_name(name) for name in imagelist]
    data.setdefault('labels', labels)
    data.setdefault('data', arr2)
    data.setdefault('filenames', list3)

    out_dir = os.path.dirname(binpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(binpath, 'wb') as output:
        pickle.dump(data, output)


# True selects the training-set paths; False selects the test-set paths.
train = False
if train:
    folder = "./cloud/train_batch_1/"        # source images
    folder_ad = "./cloud/train_batch_1_ad/"  # directory for the generated tiles
    binpath = "./dataset/cloud/cloud-5-batches-py/data_batch_1"  # output batch file
else:
    folder = "./cloud/test/"        # source images
    folder_ad = "./cloud/test_ad/"  # directory for the generated tiles
    binpath = "./dataset/cloud/cloud-5-batches-py/test_batch"  # output batch file

imglist = listdir(folder)
num = len(imglist)
split()
imagelist = listdir(folder_ad)
number = len(imagelist)
mkcf()
print('the work is finished!')