我的博客https://blog.justlovesmile.top/
目标检测是计算机视觉任务中的一个重要研究方向,其用于解决对数码图像中特定种类的可视目标实例的检测问题。目标检测作为计算机视觉的根本性问题之一,是其他诸多计算机视觉任务,例如图像描述生成,实例分割和目标跟踪的基础以及前提。而在解决此类问题时,我们常常需要使用自己的脚本或者利用标注工具生成数据集,数据集格式往往会多种多样,因此对于目标检测任务而言,为了更好地兼容训练,大多数目标检测模型框架会默认支持几种常用的数据集标注格式,常见的分别是COCO,Pascal VOC,YOLO等等。本文主要介绍上述几种数据集格式以及我写的Python脚本(一般需要根据实际情况再改改)。
COCO(Common Objects in Context)数据集,是一个大规模的,适用于目标检测,图像分割,Image Captioning任务的数据集,其标注格式是最常用的几种格式之一。目前使用较多的是COCO2017数据集。其官网为COCO - Common Objects in Context (cocodataset.org)。
COCO数据集主要包含图像(jpg或者png等等)和标注文件(json),其数据集格式如下(/
代表文件夹):
-coco/
    |-train2017/
        |-1.jpg
        |-2.jpg
    |-val2017/
        |-3.jpg
        |-4.jpg
    |-test2017/
        |-5.jpg
        |-6.jpg
    |-annotations/
        |-instances_train2017.json
        |-instances_val2017.json
        |-*.json
train2017
以及val2017
这两个文件夹中存储的是训练集和验证集的图像,而test2017
文件夹中存储的是测试集的信息,可以只是图像,也可以包含标注,一般是单独使用的。
annotations
文件夹中的文件就是标注文件,如果你有xml
文件,通常需要转换成json
格式,其格式如下(更详细的可以参考官网):
{ "info": info, "images": [image], //列表 "annotations": [annotation], //列表 "categories": [category], //列表 "licenses": [license], //列表 }
其中info
为整个数据集的信息,包括年份,版本,描述等等信息,如果只是完成训练任务,其实不太重要,如下所示:
//对于训练,不是那么的重要 info{ "year": int, "version": str, "description": str, "contributor": str, "url": str, "date_created": datetime, }
其中的image
为图像的基本信息,包括序号,宽高,文件名等等信息,其中的序号(id
)需要和后面的annotations
中的标注所属图片序号对应,如下所示:
image{ "id": int, //必要 "width": int, //必要 "height": int, //必要 "file_name": str, //必要 "license": int, "flickr_url": str, "coco_url": str, "date_captured": datetime, }
其中的annotation
是最重要的标注信息,包括序号,所属图像序号,类别序号等等信息,如下所示:
annotation{ "id": int, //标注id "image_id": int, //所属图像id "category_id": int, //类别id "segmentation": RLE or [polygon], //图像分割标注 "area": float, //区域面积 "bbox": [x,y,width,height], //目标框左上角坐标以及宽高 "iscrowd": 0 or 1, //是否密集 }
其中的category
代表类别信息,包括父类别,类别序号以及类别名称,如下所示:
category{ "id": int, //类别序号 "name": str, //类别名称 "supercategory": str, //父类别 }
其中的license
代表数据集的协议许可信息,包括序号,协议名称以及链接信息,如下所示:
//对于训练,不重要 license{ "id": int, "name": str, "url": str, }
接下来,我们来看一个简单的示例:
{ "info": {略}, "images": [{"id": 1, "file_name": "1.jpg", "height": 334, "width": 500}, {"id": 2, "file_name": "2.jpg", "height": 445, "width": 556}], "annotations": [{"id": 1, "area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 3, "segmentation": []}, {"id": 2, "area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 2, "segmentation": []}, {"id": 3, "area": 40448, "iscrowd": 0, "image_id": 2, "bbox": [246, 61, 128, 316], "category_id": 1, "segmentation": []}], "categories": [{"supercategory": "none", "id": 1, "name": "liner"},{"supercategory": "none", "id": 2, "name": "containership"},{"supercategory": "none", "id": 3, "name": "bulkcarrier"}], "licenses": [{略}] }
Python转换脚本
如下所示,需要准备图像
和xml
标注文件:
# -*- coding: utf-8 -*-
# @Author : justlovesmile
# @Date : 2021/9/8 15:36
"""Convert a Pascal VOC style dataset (images + XML annotations) to COCO format."""
import json
import os
import random
import shutil as sh
import xml.etree.ElementTree as xmlET

try:
    from tqdm.auto import tqdm
except ImportError:
    # tqdm only draws a progress bar; fall back to plain iteration without it.
    def tqdm(iterable, *args, **kwargs):
        return iterable


def mkdir(path):
    """Create ``path`` (including parents).

    Returns:
        True if the directory was created, False if it already existed
        (a notice is printed in that case).
    """
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    print(f"The path ({path}) already exists.")
    return False


def _int_text(node, tag):
    """Return the text of child ``tag`` as an int, accepting values like "104.0"."""
    return int(float(node.find(tag).text))


def readxml(file):
    """Parse one VOC XML annotation file.

    Returns:
        ``[[width, height], boxes]`` where every box is
        ``[xmin, ymin, xmax, ymax, label]``.
    """
    tree = xmlET.parse(file)
    # image size fields
    size = tree.find('size')
    width = _int_text(size, 'width')
    height = _int_text(size, 'height')
    # object fields
    bndbox = []
    for obj in tree.findall('object'):
        label = obj.find("name").text
        bnd = obj.find("bndbox")
        bndbox.append([
            _int_text(bnd, "xmin"),
            _int_text(bnd, "ymin"),
            _int_text(bnd, "xmax"),
            _int_text(bnd, "ymax"),
            label,
        ])
    return [[width, height], bndbox]


def _new_content():
    """Return an empty COCO annotation skeleton for one split."""
    return {
        "images": [],       # {"file_name": "09780.jpg", "height": 334, "width": 500, "id": 9780}
        "annotations": [],  # {"area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 5, "id": 1, "segmentation": []}
        "categories": [],   # {"supercategory": "none", "id": 1, "name": "liner"}
    }


def _append_image(content, file, size, boxes, classes):
    """Append one image and its boxes to ``content``; ids are 1-based per split."""
    image_id = len(content["images"]) + 1
    content["images"].append(
        {"file_name": file, "width": size[0], "height": size[1], "id": image_id})
    for xmin, ymin, xmax, ymax, label in boxes:
        w = xmax - xmin
        h = ymax - ymin
        # COCO boxes are [x, y, width, height] measured from the top-left corner.
        content["annotations"].append(
            {"area": w * h, "iscrowd": 0, "image_id": image_id,
             "bbox": [xmin, ymin, w, h], "category_id": classes[label],
             "id": len(content["annotations"]) + 1, "segmentation": []})


def tococo(xml_root, image_root, output_root, classes={}, errorId=[], train_percent=0.9):
    """Split a VOC-annotated dataset into COCO ``train2017`` / ``val2017``.

    Args:
        xml_root: folder with one ``<image stem>.xml`` per image.
        image_root: folder with the images.
        output_root: destination folder; must not exist yet, otherwise
            nothing is written (a notice is printed by ``mkdir``).
        classes: mapping of class name -> category id (must be non-empty).
        errorId: image stems (file names without extension) to skip.
        train_percent: fraction of the listed images sampled for training;
            the remainder goes to the validation split.

    Raises:
        AssertionError: if ``train_percent > 1`` or ``classes`` is empty.
        KeyError: if an annotation uses a label missing from ``classes``.
    """
    assert train_percent <= 1 and len(classes) > 0

    train_root = os.path.join(output_root, "train2017")
    val_root = os.path.join(output_root, "val2017")
    ann_root = os.path.join(output_root, "annotations")
    train_json = 'instances_train2017.json'
    val_json = 'instances_val2017.json'
    train_content = _new_content()
    val_content = _new_content()

    # divide the trainset and valset
    images = os.listdir(image_root)
    train_num = int(len(images) * train_percent)
    # Sets give O(1) membership tests inside the per-image loop.
    train_files = set(random.sample(images, train_num))
    skip_ids = set(errorId)

    # Refuse to write into an existing output tree so earlier results are
    # never silently mixed with new ones.
    if not mkdir(output_root):
        return
    if not (mkdir(train_root) and mkdir(val_root) and mkdir(ann_root)):
        return

    for file in tqdm(images):
        name = os.path.splitext(os.path.basename(file))[0]
        if name in skip_ids:
            continue
        size, boxes = readxml(os.path.join(xml_root, name + '.xml'))
        if file in train_files:
            target_root, content = train_root, train_content
        else:
            target_root, content = val_root, val_content
        sh.copy(os.path.join(image_root, file), target_root)
        _append_image(content, file, size, boxes, classes)

    for cls_name, cls_id in classes.items():
        train_content['categories'].append({"supercategory": "none", "id": cls_id, "name": cls_name})
        val_content['categories'].append({"supercategory": "none", "id": cls_id, "name": cls_name})

    with open(os.path.join(ann_root, train_json), 'w', encoding='utf-8') as f:
        json.dump(train_content, f)
    with open(os.path.join(ann_root, val_json), 'w', encoding='utf-8') as f:
        json.dump(val_content, f)
    print("Number of Train Images:", len(os.listdir(train_root)))
    print("Number of Val Images:", len(os.listdir(val_root)))


def test():
    """Run the conversion on the sample paths below (edit them for your data)."""
    box_root = "E:/MyProject/Dataset/hwtest/annotations"  # XML folder
    image_root = "E:/MyProject/Dataset/hwtest/images"  # image folder
    output_root = "E:/MyProject/Dataset/coco"  # output folder
    classes = {"liner": 0,"bulk carrier": 1,"warship": 2,"sailboat": 3,"canoe": 4,"container ship": 5,"fishing boat": 6}  # class-name -> id mapping
    errorId = []  # ids of dirty samples to skip
    train_percent = 0.9  # train / val ratio
    if not (os.path.isdir(box_root) and os.path.isdir(image_root)):
        print("Sample dataset not found; edit the paths in test() before running.")
        return
    tococo(box_root, image_root, output_root,classes=classes,errorId=errorId,train_percent=train_percent)


if __name__ == "__main__":
    test()
VOC(Visual Object Classes)数据集来源于PASCAL VOC挑战赛,其主要任务有Object Classification
、Object Detection
、Object Segmentation
、Human Layout
、Action Classification
。其官网为The PASCAL Visual Object Classes Homepage (ox.ac.uk)。其主要数据集有VOC2007以及VOC2012。
VOC数据集主要包含图像(jpg或者png等等)和标注文件(xml),其数据集格式如下(/
代表文件夹):
-VOC/
    |-JPEGImages/
        |-1.jpg
        |-2.jpg
    |-Annotations/
        |-1.xml
        |-2.xml
    |-ImageSets/
        |-Layout/
            |-*.txt
        |-Main/
            |-train.txt
            |-val.txt
            |-trainval.txt
            |-test.txt
        |-Segmentation/
            |-*.txt
        |-Action/
            |-*.txt
    |-SegmentationClass/
    |-SegmentationObject/
其中对于目标检测任务而言,最常用的以及必须的文件夹包括:JPEGImages
,Annotations
,ImageSets/Main
。
JPEGImages
里存放的是图像,而Annotations
里存放的是xml
标注文件,文件内容如下:
<annotation> <folder>VOC</folder> # 图像所在文件夹 <filename>000032.jpg</filename> # 图像文件名 <source> # 图像源 <database>The VOC Database</database> <annotation>PASCAL VOC</annotation> <image>flickr</image> </source> <size> # 图像尺寸信息 <width>500</width> # 图像宽度 <height>281</height> # 图像高度 <depth>3</depth> # 图像通道数 </size> <segmented>0</segmented> # 图像是否用于分割,0代表不适用,对目标检测而言没关系 <object> # 一个目标对象的信息 <name>aeroplane</name> # 目标的类别名 <pose>Frontal</pose> # 拍摄角度,若无一般为Unspecified <truncated>0</truncated> # 是否被截断,0表示完整未截断 <difficult>0</difficult> # 是否难以识别,0表示不难识别 <bndbox> # 边界框信息 <xmin>104</xmin> # 左上角x <ymin>78</ymin> # 左上角y <xmax>375</xmax> # 右下角x <ymax>183</ymax> # 右下角y </bndbox> </object> # 下面是其他目标的信息,这里略掉 <object> 其他object信息,这里省略 </object> </annotation>
下面这个脚本只适用于已有图像和xml文件的情况,coco转voc格式的脚本以后有需要再写:
# -*- coding: utf-8 -*-
# @Author : justlovesmile
# @Date : 2021/9/8 21:01
"""Arrange images and VOC XML annotations into the Pascal VOC directory layout."""
import os
import random
import shutil as sh

try:
    from tqdm.auto import tqdm
except ImportError:
    # tqdm only draws a progress bar; fall back to plain iteration without it.
    def tqdm(iterable, *args, **kwargs):
        return iterable


def mkdir(path):
    """Create ``path`` (including parents).

    Returns:
        True if the directory was created, False if it already existed
        (a notice is printed in that case).
    """
    if not os.path.exists(path):
        # makedirs (not mkdir) so a missing parent folder is not an error.
        os.makedirs(path)
        return True
    print(f"The path ({path}) already exists.")
    return False


def tovoc(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1.0, trp=0.9):
    '''
    Copy a dataset into VOC layout and write the ImageSets/Main split lists.

    Args:
        xmlroot: folder with one ``<image stem>.xml`` per image.
        imgroot: folder with the images.
        saveroot: root of the VOC dataset to create.
        errorId: image stems to exclude; they are neither copied nor listed.
        classes: class-name -> id mapping; only checked for being non-empty.
        tvp: fraction of all images that go to trainval (the rest is test).
        trp: fraction of trainval that goes to train (the rest is val).

    Resulting layout:
        VOC/
            Annotations/
                - **.xml
            JPEGImages/
                - **.jpg
            ImageSets/
                Main/
                    - train.txt
                    - test.txt
                    - val.txt
                    - trainval.txt

    Raises:
        AssertionError: if ``classes`` is empty.
        ValueError: if ``tvp`` or ``trp`` is outside [0, 1].
    '''
    assert len(classes) > 0
    if not (0 <= tvp <= 1 and 0 <= trp <= 1):
        raise ValueError("tvp and trp must both be within [0, 1]")

    # init path
    ann_path = os.path.join(saveroot, 'Annotations')
    img_path = os.path.join(saveroot, 'JPEGImages')
    set_path = os.path.join(saveroot, 'ImageSets')
    txt_path = os.path.join(set_path, 'Main')
    # Ensure every folder exists even when saveroot was created beforehand;
    # otherwise the copies below fail on a missing sub-folder.
    for path in (saveroot, ann_path, img_path, set_path, txt_path):
        mkdir(path)

    images = os.listdir(imgroot)
    indices = range(len(images))
    trainval_num = int(len(images) * tvp)
    train_num = int(trainval_num * trp)
    trainval_list = random.sample(indices, trainval_num)
    # train must be drawn from trainval (not from all images), otherwise
    # train.txt / val.txt no longer partition trainval when tvp < 1.
    train = set(random.sample(trainval_list, train_num))
    trainval = set(trainval_list)
    skip_ids = set(errorId)
    # NOTE(review): as before, trp == 1 yields an empty val list; confirm
    # whether a val subset overlapping train was intended in that case.

    split_ids = {"trainval": [], "train": [], "val": [], "test": []}
    for i in tqdm(indices):
        imgfile = images[i]
        img_id = os.path.splitext(os.path.basename(imgfile))[0]
        if img_id in skip_ids:
            continue
        xmlfile = img_id + ".xml"
        sh.copy(os.path.join(imgroot, imgfile), os.path.join(img_path, imgfile))
        sh.copy(os.path.join(xmlroot, xmlfile), os.path.join(ann_path, xmlfile))
        if i in trainval:
            split_ids["trainval"].append(img_id)
            split_ids["train" if i in train else "val"].append(img_id)
        else:
            split_ids["test"].append(img_id)

    # Write each list once in 'w' mode so re-running does not append
    # duplicate entries to lists left over from a previous run.
    for split, ids in split_ids.items():
        with open(os.path.join(txt_path, split + '.txt'), 'w') as f:
            f.writelines(img_id + '\n' for img_id in ids)
    # end
    print("Dataset to VOC format finished!")


def test():
    """Run the conversion on the sample paths below (edit them for your data)."""
    box_root = "E:/MyProject/Dataset/hwtest/annotations"
    image_root = "E:/MyProject/Dataset/hwtest/images"
    output_root = "E:/MyProject/Dataset/voc"
    classes = {"liner": 0,"bulk carrier": 1,"warship": 2,"sailboat": 3,"canoe": 4,"container ship": 5,"fishing boat": 6}
    errorId = []
    train_percent = 0.9
    if not (os.path.isdir(box_root) and os.path.isdir(image_root)):
        print("Sample dataset not found; edit the paths in test() before running.")
        return
    tovoc(box_root,image_root,output_root,errorId,classes,trp=train_percent)


if __name__ == "__main__":
    test()
YOLO
数据集格式的出现主要是为了训练YOLO
模型,其文件格式没有固定的要求,因为可以通过修改模型的配置文件进行数据加载,唯一需要注意的是YOLO
数据集的标注格式是将目标框的位置信息进行归一化处理(此处归一化指的是除以图片宽和高),如下所示:
{目标类别} {归一化后的目标中心点x坐标} {归一化后的目标中心点y坐标} {归一化后的目标框宽度w} {归一化后的目标框高度h}
Python
转换脚本如下所示:
# -*- coding: utf-8 -*-
# @Author : justlovesmile
# @Date : 2021/9/8 20:28
"""Convert a Pascal VOC style dataset (images + XML annotations) to YOLO format."""
import os
import random
import shutil as sh
# xml.etree.cElementTree was removed in Python 3.9; ElementTree already uses
# the C accelerator when it is available.
import xml.etree.ElementTree as et

try:
    from tqdm.auto import tqdm
except ImportError:
    # tqdm only draws a progress bar; fall back to plain iteration without it.
    def tqdm(iterable, *args, **kwargs):
        return iterable


def mkdir(path):
    """Create ``path`` (including parents).

    Returns:
        True if the directory was created, False if it already existed
        (a notice is printed in that case).
    """
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    print(f"The path ({path}) already exists.")
    return False


def xml2yolo(xmlpath, savepath, classes={}):
    """Write the YOLO label file for one VOC XML annotation.

    Each output line is ``<class id> <cx> <cy> <w> <h>`` with the box centre
    and size divided by the image width / height taken from the XML.

    Args:
        xmlpath: VOC XML annotation to read.
        savepath: label ``.txt`` file to write.
        classes: class-name -> id mapping.

    Raises:
        KeyError: if an object's name is missing from ``classes``.
    """
    rt = et.parse(xmlpath).getroot()
    size = rt.find("size")
    # float() also accepts integer text, and tolerates tools that write "500.0".
    w = float(size.find("width").text)
    h = float(size.find("height").text)
    lines = []
    for obj in rt.findall("object"):
        name = obj.find("name").text
        box = obj.find("bndbox")
        xmin = float(box.find("xmin").text)
        ymin = float(box.find("ymin").text)
        xmax = float(box.find("xmax").text)
        ymax = float(box.find("ymax").text)
        lines.append(
            f"{classes[name]} {(xmin+xmax)/w/2.} {(ymin+ymax)/h/2.} {(xmax-xmin)/w} {(ymax-ymin)/h}"
            + "\n"
        )
    # Write only after every object converted, so an unknown class does not
    # leave a truncated label file behind.
    with open(savepath, "w") as f:
        f.writelines(lines)


def trainval(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1.0, trp=0.9):
    """Split the dataset and write images / labels into train, val and test.

    Args:
        xmlroot: folder with one ``<image stem>.xml`` per image.
        imgroot: folder with the images.
        saveroot: output root; ``images/`` and ``labels/`` are created in it.
        errorId: image stems to skip entirely.
        classes: class-name -> id mapping (must be non-empty).
        tvp: fraction of all images used for train + val (the rest is test).
        trp: fraction of train + val used for train (the rest is val).
            With ``trp == 1.0`` a 15% sample of the training images is
            additionally copied into val.

    Raises:
        AssertionError: if ``tvp`` or ``trp`` exceeds 1 or ``classes`` is empty.
    """
    assert tvp <= 1.0 and trp <= 1.0 and len(classes) > 0
    # create dirs
    mkdir(saveroot)
    for kind in ('images', 'labels'):
        mkdir(os.path.join(saveroot, kind))
        for split in ('train', 'val', 'test'):
            mkdir(os.path.join(saveroot, kind, split))

    # train / val
    trainval_percent = tvp
    train_percent = trp
    val_percent = 1 - train_percent if train_percent < 1.0 else 0.15

    total_img = os.listdir(imgroot)
    num = len(total_img)
    list_index = range(num)
    tv = int(num * trainval_percent)
    tr = int(tv * train_percent)
    va = int(tv * val_percent)
    trainval_list = random.sample(list_index, tv)  # trainset and valset
    train_ids = set(random.sample(trainval_list, tr))  # trainset
    val_ids = set(random.sample(trainval_list, va))  # valset, used only when train_percent == 1
    # Sets give O(1) membership tests inside the per-image loop.
    trainval_ids = set(trainval_list)
    skip_ids = set(errorId)
    print(f"trainval_percent:{trainval_percent},train_percent:{train_percent},val_percent:{val_percent}")

    for i in tqdm(list_index):
        name = total_img[i]
        file_id = os.path.splitext(os.path.basename(name))[0]
        if file_id in skip_ids:
            continue
        if i in trainval_ids:
            splits = ["train"] if i in train_ids else ["val"]
            if train_percent == 1.0 and i in val_ids and "val" not in splits:
                splits.append("val")
        else:
            splits = ["test"]
        src_img = os.path.join(imgroot, name)
        src_xml = os.path.join(xmlroot, file_id + '.xml')
        for split in splits:
            xml2yolo(src_xml, os.path.join(saveroot, "labels", split, file_id + '.txt'), classes)
            sh.copy(src_img, os.path.join(saveroot, "images", split, name))


def maketxt(dir, saveroot, filename):
    """Write ``saveroot/filename`` listing the path of every entry in ``dir``.

    Entries are written in sorted order, one ``os.path.join(dir, entry)`` per line.
    """
    savetxt = os.path.join(saveroot, filename)
    with open(savetxt, 'w') as f:
        for entry in tqdm(sorted(os.listdir(dir))):
            f.write(os.path.join(dir, entry) + '\n')


def toyolo(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1, train_percent=0.9):
    """Convert the whole dataset to YOLO layout and write the three list files.

    Calls ``trainval`` to split and convert, then writes ``train.txt``,
    ``val.txt`` and ``test.txt`` (image paths) into ``saveroot``.
    """
    # toyolo main function
    trainval(xmlroot, imgroot, saveroot, errorId, classes, tvp, train_percent)
    for split in ("train", "val", "test"):
        maketxt(os.path.join(saveroot, "images", split), saveroot, split + ".txt")
    print("Dataset to yolo format success.")


def test():
    """Run the conversion on the sample paths below (edit them for your data)."""
    box_root = "E:/MyProject/Dataset/hwtest/annotations"
    image_root = "E:/MyProject/Dataset/hwtest/images"
    output_root = "E:/MyProject/Dataset/yolo"
    classes = {"liner": 0,"bulk carrier": 1,"warship": 2,"sailboat": 3,"canoe": 4,"container ship": 5,"fishing boat": 6}
    errorId = []
    train_percent = 0.9
    if not (os.path.isdir(box_root) and os.path.isdir(image_root)):
        print("Sample dataset not found; edit the paths in test() before running.")
        return
    toyolo(box_root,image_root,output_root,errorId,classes,train_percent=train_percent)


if __name__ == "__main__":
    test()
按照此脚本,将会在输出文件夹中生成以下内容:
-yolo/
    |-images/
        |-train/
            |-1.jpg
            |-2.jpg
        |-test/
            |-3.jpg
            |-4.jpg
        |-val/
            |-5.jpg
            |-6.jpg
    |-labels/
        |-train/
            |-1.txt
            |-2.txt
        |-test/
            |-3.txt
            |-4.txt
        |-val/
            |-5.txt
            |-6.txt
    |-train.txt
    |-test.txt
    |-val.txt