第一步是制作自己的数据集(照片),可以是网络找的,也可以是自己拍的,甚至可以是自己p的。
以我下面讲解的数据集为例子,我是在网上找的关于苹果的病虫害,我简单地做了三个分类,分别是Alternaria_Boltch(斑点落叶病)、Grey_spot(灰斑病)、Rust(锈病)。我的文件结构如下:
每个文件下放着我的数据集照片:
就不一一展示了,反正就是有几个类就创几个文件夹,再把各个类别的照片放进对应的文件夹中,这样我们的数据集就初步制作完毕了。
这一步是很重要的一步。现在市面上常用的打标签工具有labelme和labelimg两种,我这里就以labelimg为例,介绍一下打标签的过程。
首先,如果没装过labelimg的朋友需要先装一下labelimg插件。步骤是先打开Anaconda Prompt(直接用cmd也行),选择一下把插件装在哪个环境:
指令:activate 环境名
也可以自己用Anaconda自己创一个环境,具体创环境步骤我就不写了,网上好多教程,可以学下。
然后就安装labelimg插件:
指令:pip install labelimg
这里显示我装过了,反正输入指令一般是可以装成功的。
再打开labelimg
指令:labelimg
之后系统就会打开labelimg插件,插件GUI界面如下:
注意:打开labelimg的时候,你的Anaconda Prompt/cmd不能叉掉,必须同时运行
再点击下面的 Open Dir,打开自己的数据集某个类别的文件夹
如图我选择打开我的第一个分类Alternaria_Boltch,显示:
注意:在切换到下一个类别的时候,别忘了把右上角的默认标签也改成相应的标签名,不然就会发现白打了…
OK,打标签的步骤就说到这里,做到这里你应该把你的所有类别文件下的所有图片都打好了标签,并在同一个文件下生成了所有图片对应的xml标签文件,如下图:
以上是我打的三个类别(文件)下的所有图片和他们一一对应的标签(xml)。
我们应该都很熟悉VOC的文件格式了,VOC的目录格式如下:
-VOC
现在我们将自己的数据集制作成VOC的格式。
把自己的种类排一个序(想怎么排怎么排),写成如下字典的格式:
这里我写了一个移动文件的脚本,就是原本xml和image混在一起放在各个类别文件下的,现在我将他们全部区分开,将所有的xml放入Annotations文件中,将所有的图片放入Images文件中。需要使用的话只需注意检测代码中的全局地址变量要改成你电脑的地址就可以执行了。
move_image_xml.py
"""Move labeled images and xml annotations into VOC-style storage.

Running this script produces two folders: Annotations (holding every .xml
annotation) and Images (holding every .jpg image), gathered out of the
per-class source folders where labelimg mixed them together.
"""
import os
import shutil
import json

# --------------------------global path settings--------------------------------#
# NOTE: raw strings (r"...") keep Windows backslashes literal; a plain
# "F:\apple..." would interpret "\a" as the BEL escape and corrupt the path.
xml_save_root = r"F:\yolov3-my\data\ApplePest\Annotations"  # Annotations folder
images_save_root = r"F:\yolov3-my\data\ApplePest\Images"    # Images folder
class_path = r"F:\yolov3-my\data\apple_pest_classes.json"   # classes.json file
data_path = r"F:\dataset"  # labeling source root; its sub-folders are the class folders (xml + jpg)

assert os.path.exists(class_path), "class_path not exist!"
assert os.path.exists(data_path), "data_path not exist!"
os.makedirs(xml_save_root, exist_ok=True)
os.makedirs(images_save_root, exist_ok=True)
# --------------------------global path settings--------------------------------#


def move_image_xml(cla_path, xml_root, images_root, data_path):
    """Move every .xml into xml_root and every .jpg into images_root.

    :param cla_path: path of the classes json file; its keys are the class
        names and therefore also the names of the source sub-folders
    :param xml_root: destination folder for annotation (.xml) files
    :param images_root: destination folder for image (.jpg) files
    :param data_path: source root whose sub-folders are named after classes
    """
    with open(cla_path) as f:  # class names are the keys of the json mapping
        labels = list(json.load(f).keys())

    for label in labels:
        open_root = os.path.join(data_path, label)
        for file_full_name in os.listdir(open_root):
            # dispatch on the extension; anything else is left untouched
            file_type = os.path.splitext(file_full_name)[1]
            open_path = os.path.join(open_root, file_full_name)
            if file_type == '.xml':
                shutil.move(open_path, os.path.join(xml_root, file_full_name))
            elif file_type == '.jpg':
                shutil.move(open_path, os.path.join(images_root, file_full_name))


if __name__ == '__main__':
    move_image_xml(class_path, xml_save_root, images_save_root, data_path)
执行完后,所有类别文件夹下的文件都被移走、清空:
在指定xml_save_root 、images_save_root 位置生成了两个文件,分别是Annotations(存放所有种类的xml)和Images(存放所有种类的图片):
生成ImageSets的过程其实就是划分训练集和验证集(可能还有测试集)的过程。
下面是我写的一个脚本,可以从Annotations中随机划分训练集和测试集,最终生成ImageSets/train.txt和val.txt。
split_train_val.py
"""Randomly split the Annotations folder into a training and a validation
set, generating ImageSets/train.txt and ImageSets/val.txt."""
import os
import random
from os.path import *

# --------------------------global path settings--------------------------------#
dir_path = dirname(dirname(abspath(__file__)))
xml_path = os.path.join(dir_path, "ApplePest", "Annotations")
assert os.path.exists(xml_path), "xml_path not exist!"
ImageSets_path = os.path.join(dir_path, "ApplePest", "ImageSets")
if not os.path.exists(ImageSets_path):
    os.makedirs(ImageSets_path)
traintxt_path = os.path.join(dir_path, "ApplePest", "ImageSets", "train.txt")
valtxt_path = os.path.join(dir_path, "ApplePest", "ImageSets", "val.txt")
# start from a clean slate so stale lists from an earlier run cannot survive
if os.path.exists(traintxt_path):
    os.remove(traintxt_path)
if os.path.exists(valtxt_path):
    os.remove(valtxt_path)
# --------------------------global path settings--------------------------------#


def create_imagesets(xml_full_path, traintxt_full_path, valtxt_full_path):
    """Write train.txt / val.txt, each line one xml basename (no extension).

    :param xml_full_path: folder holding all annotation .xml files
    :param traintxt_full_path: output path of train.txt
    :param valtxt_full_path: output path of val.txt
    """
    train_percent = 0.8  # change the split ratio here
    # the validation share is implicitly 1 - train_percent

    total_xml = os.listdir(xml_full_path)
    num = len(total_xml)
    num_train = int(num * train_percent)
    # keep the sampled indices in a set: O(1) membership per file, whereas
    # the original list.remove() loop + "in list" test was O(n^2) overall
    train_idx = set(random.sample(range(num), num_train))

    with open(traintxt_full_path, 'w') as ftrain, open(valtxt_full_path, 'w') as fval:
        for i in range(num):
            name = total_xml[i][:-4] + '\n'  # strip the ".xml" suffix
            if i in train_idx:
                ftrain.write(name)
            else:
                fval.write(name)


if __name__ == '__main__':
    create_imagesets(xml_path, traintxt_path, valtxt_path)
效果如下:
voc2yolo.py
"""
This script has two jobs:
1. Using train.txt and val.txt, convert the VOC annotation info (.xml) into
   the yolo label format (.txt), generating the dataset folder (train + val).
2. From the json label file, generate the matching names label file
   (my_data_label.names).
"""
import os
from tqdm import tqdm
from lxml import etree
import json
import shutil
from os.path import *

# --------------------------global path settings--------------------------------#
# assemble the voc images dir, xml dir and the ImageSets txt paths
dir_path = dirname(dirname(abspath(__file__)))
# NOTE(review): the move script creates "Images" (capital I); on a
# case-sensitive filesystem this lowercase name must match — confirm.
images_path = os.path.join(dir_path, "ApplePest", "images")
xml_path = os.path.join(dir_path, "ApplePest", "Annotations")
train_txt_path = os.path.join(dir_path, "ApplePest", "ImageSets", "train.txt")
val_txt_path = os.path.join(dir_path, "ApplePest", "ImageSets", "val.txt")
# json file holding the label name -> id mapping
label_json_path = os.path.join(dir_path, "apple_pest_classes.json")
save_file_root = os.path.join(dir_path, "dataset")

# verify every input exists before doing any work
assert os.path.exists(images_path), "images path not exist..."
assert os.path.exists(xml_path), "xml path not exist..."
assert os.path.exists(train_txt_path), "train txt file not exist..."
assert os.path.exists(val_txt_path), "val txt file not exist..."
assert os.path.exists(label_json_path), "label_json_path does not exist..."
if os.path.exists(save_file_root) is False:
    os.makedirs(save_file_root)
# --------------------------global path settings--------------------------------#


def parse_xml_to_dict(xml):
    """Parse an xml tree into nested dicts, following tensorflow's
    recursive_parse_xml_to_dict.

    Args:
        xml: xml tree obtained by parsing XML file contents using lxml.etree

    Returns:
        Python dictionary holding XML contents.
    """
    if len(xml) == 0:  # leaf node: return its tag/text directly
        return {xml.tag: xml.text}

    result = {}
    for child in xml:
        child_result = parse_xml_to_dict(child)  # recurse into children
        if child.tag != 'object':
            result[child.tag] = child_result[child.tag]
        else:
            # there may be several objects per image, so collect them in a list
            if child.tag not in result:
                result[child.tag] = []
            result[child.tag].append(child_result[child.tag])
    return {xml.tag: result}


def translate_info(file_names: list, save_root: str, class_dict: dict, train_val='train'):
    """Convert each listed xml annotation into a yolo-format txt file and
    copy the matching image into the dataset folder.

    :param file_names: basenames (no extension) to convert
    :param save_root: root folder of the generated dataset
    :param class_dict: mapping class name -> 1-based class id
    :param train_val: sub-folder name, "train" or "val"
    """
    save_txt_path = os.path.join(save_root, train_val, "labels")
    if os.path.exists(save_txt_path) is False:
        os.makedirs(save_txt_path)
    save_images_path = os.path.join(save_root, train_val, "images")
    if os.path.exists(save_images_path) is False:
        os.makedirs(save_images_path)

    for file in tqdm(file_names, desc="translate {} file...".format(train_val)):
        # check the image file exists
        img_path = os.path.join(images_path, file + ".jpg")
        assert os.path.exists(img_path), "file:{} not exist...".format(img_path)

        # check the xml file exists
        xml_full_path = os.path.join(xml_path, file + ".xml")
        assert os.path.exists(xml_full_path), "file:{} not exist...".format(xml_full_path)

        # read xml
        with open(xml_full_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = parse_xml_to_dict(xml)["annotation"]
        img_height = int(data["size"]["height"])
        img_width = int(data["size"]["width"])

        # write object info into txt
        with open(os.path.join(save_txt_path, file + ".txt"), "w") as f:
            assert "object" in data.keys(), "file: '{}' lack of object key.".format(xml_full_path)
            for index, obj in enumerate(data["object"]):
                # box info of the current object
                xmin = float(obj["bndbox"]["xmin"])
                xmax = float(obj["bndbox"]["xmax"])
                ymin = float(obj["bndbox"]["ymin"])
                ymax = float(obj["bndbox"]["ymax"])
                class_name = obj["name"]
                class_index = class_dict[class_name] - 1  # yolo ids start at 0

                # convert the box to yolo format: center point + width/height
                xcenter = xmin + (xmax - xmin) / 2
                ycenter = ymin + (ymax - ymin) / 2
                w = xmax - xmin
                h = ymax - ymin

                # absolute -> relative coordinates, kept to 6 decimals
                xcenter = round(xcenter / img_width, 6)
                ycenter = round(ycenter / img_height, 6)
                w = round(w / img_width, 6)
                h = round(h / img_height, 6)

                info = [str(i) for i in [class_index, xcenter, ycenter, w, h]]
                if index == 0:
                    f.write(" ".join(info))
                else:
                    f.write("\n" + " ".join(info))

        # copy image into save_images_path
        shutil.copyfile(img_path, os.path.join(save_images_path, img_path.split(os.sep)[-1]))


def create_class_names(class_dict: dict):
    """Write the class names, one per line, to ../dataset_classes.names."""
    keys = class_dict.keys()
    with open("../dataset_classes.names", "w") as w:
        for index, k in enumerate(keys):
            if index + 1 == len(keys):
                w.write(k)  # no trailing newline after the last name
            else:
                w.write(k + "\n")


def main():
    # read class_indict; "with" guarantees the handle is closed (the original
    # open() here was never closed)
    with open(label_json_path, 'r') as json_file:
        class_dict = json.load(json_file)

    # read every line of train.txt, dropping empty lines
    with open(train_txt_path, "r") as r:
        train_file_names = [i for i in r.read().splitlines() if len(i.strip()) > 0]
    # convert voc info to yolo and copy the images into place
    translate_info(train_file_names, save_file_root, class_dict, "train")

    # read every line of val.txt, dropping empty lines
    with open(val_txt_path, "r") as r:
        val_file_names = [i for i in r.read().splitlines() if len(i.strip()) > 0]
    # convert voc info to yolo and copy the images into place
    translate_info(val_file_names, save_file_root, class_dict, "val")

    # create the my_data_label.names file
    create_class_names(class_dict)


if __name__ == "__main__":
    main()
生成yolo格式的数据集dataset,并分train和val存放,分别存储images和labels,存放划分好的训练集的图片和标签,并生成dataset_classes.names文件,效果如下:
calculate_dataset.py
"""
1. Collect the train / val label files and generate the matching
   train_path.txt and val_path.txt (one image path per line).
2. Create the data.data file, recording the number of classes, the paths of
   the train / val list files (.txt) and the path of the label .names file.
"""
import os
from os.path import *

# --------------------------global path settings--------------------------------#
dir_path = dirname(dirname(abspath(__file__)))
train_annotation_dir = os.path.join(dir_path, "dataset", "train", "labels")
val_annotation_dir = os.path.join(dir_path, "dataset", "val", "labels")
classes_label = os.path.join(dir_path, "dataset_classes.names")
assert os.path.exists(train_annotation_dir), "train_annotation_dir not exist!"
assert os.path.exists(val_annotation_dir), "val_annotation_dir not exist!"
assert os.path.exists(classes_label), "classes_label not exist!"

# output paths
train_path_txt = os.path.join(dir_path, "train_path.txt")
val_path_txt = os.path.join(dir_path, "val_path.txt")
dataset_data = os.path.join(dir_path, "dataset.data")
# --------------------------global path settings--------------------------------#


def calculate_data_txt(txt_path, dataset_dir):
    """Write one image path per label file found in dataset_dir.

    :param txt_path: output txt file that records the image list
    :param dataset_dir: labels folder; swapping its "labels" component for
        "images" locates the corresponding .jpg files
    """
    with open(txt_path, "w") as w:
        for file_name in os.listdir(dataset_dir):
            if file_name == "classes.txt":  # labelimg bookkeeping file, skip
                continue
            img_path = os.path.join(dataset_dir.replace("labels", "images"),
                                    file_name.split(".")[0]) + ".jpg"
            assert os.path.exists(img_path), "file:{} not exist!".format(img_path)
            w.write(img_path + "\n")


def create_dataset_data(create_data_path, label_path, train_path, val_path, classes_info):
    """Create the .data file that records classes, train, valid and names info.

    :param create_data_path: output path of the .data file
    :param label_path: path of the label .names file (the original read the
        global classes_label instead of this parameter; fixed)
    :param train_path: path of the txt listing the training images
    :param val_path: path of the txt listing the validation images
    :param classes_info: list of class names (only its length is recorded)
    """
    with open(create_data_path, "w") as w:
        w.write("classes={}".format(len(classes_info)) + "\n")  # number of classes
        w.write("train={}".format(train_path) + "\n")           # train list txt path
        w.write("valid={}".format(val_path) + "\n")             # val list txt path
        w.write("names={}".format(label_path) + "\n")           # label .names path


def main():
    # generate the train / val image list files
    calculate_data_txt(train_path_txt, train_annotation_dir)
    calculate_data_txt(val_path_txt, val_annotation_dir)

    # read the class names, dropping empty lines; "with" closes the handle
    # (the original open().readlines() leaked the file object)
    with open(classes_label, "r") as f:
        classes_info = [line.strip() for line in f.readlines() if len(line.strip()) > 0]

    # dataset.data records the class count plus the train/val txt and .names paths
    create_dataset_data(dataset_data, classes_label, train_path_txt, val_path_txt, classes_info)


if __name__ == '__main__':
    main()
生成train_path.txt(存放训练集所有图片的地址)和val_path.txt(存放验证集所有图片的地址),效果如下:
并生成dataset.data(存放数据集的一些有用的信息:类别数量,训练文件train.txt的地址,训练文件val.txt的地址,dataset_classes.names的地址等),效果如下:
到这里我们项目关于数据集的制作与格式处理就全部完成了,数据集的文件目录如下: