代码如下:
import csv
import os
import re

import parsel  # only needed for the XPath alternative mentioned in baidu_img
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0)like Gecko)'}


def baidu_img(keword, num):
    """Download up to ``num`` Baidu image-search results for ``keword``.

    Every image is written as ``<name>.jpg`` into a folder named after the
    keyword, and one ``[name, url]`` row per saved image is appended to a
    CSV file that sits next to that folder (``<folder>.csv``).

    Args:
        keword: Search keyword; also used as the output folder name.
        num: Maximum number of images to save. Values <= 0 save nothing.

    A failure on a single image (network error, bad status, unwritable
    file) is printed and the crawl continues with the next URL.
    """
    timeout = 10  # seconds; without it a stalled server blocks the crawl forever
    base_url = 'https://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word={}'.format(keword)
    path1 = r"E:\图片\\" + keword
    # makedirs also creates the missing parent folder and tolerates an
    # already existing target, which os.mkdir does not.
    os.makedirs(path1, exist_ok=True)

    response = requests.get(base_url, headers=headers, timeout=timeout)
    html_str = response.text
    # Alternative (unused): parsel.Selector(html_str).xpath('//li/div/a/img/@src').
    # Extract the original image URLs with a regular expression.
    pic_url = re.findall('"objURL":"(.*?)",', html_str, re.S)

    n = 0  # number of images saved so far
    for i in pic_url:
        # Check the limit before downloading so num <= 0 fetches nothing.
        if n >= num:
            break
        try:
            reply = requests.get(i, headers=headers, timeout=timeout)
            # Do not store an HTML error page under a .jpg name.
            reply.raise_for_status()
            # The text after the last '=' may still hold characters that are
            # illegal in file names (or be the whole URL); neutralise them.
            img_name = re.sub(r'[\\/:*?"<>|\s]+', '_', i.split('=')[-1])
            if not img_name:
                img_name = 'image_{}'.format(n + 1)
            with open(os.path.join(path1, img_name + '.jpg'), 'wb') as f:
                f.write(reply.content)
            n = n + 1
            with open(path1 + '.csv', 'a', newline='') as ff:
                csvwriter = csv.writer(ff, dialect='excel')
                csvwriter.writerow([img_name, i])
        except Exception as e:
            # Best effort per image: report and move on to the next URL.
            print(i, e)


if __name__ == '__main__':
    baidu_img('狗', 20)
如下图所示,20张狗狗图片已经爬取到我们的磁盘中来了
代码如下:
import hashlib
import os

import numpy as np
import requests
from PIL import Image


def md5(dirName):
    """Delete duplicate images in ``dirName`` by hashing their decoded pixels.

    Files ending in .jpg/.jpeg/.png are decoded with Pillow; the MD5 of the
    pixel array (together with its shape and dtype) identifies an image.
    The first file seen for a digest is kept, later ones are removed.
    Files that cannot be decoded are removed as well, as in the original
    workflow, but each such removal is now reported.

    Args:
        dirName: Folder whose direct children are examined (not recursive).

    Prints the number of duplicates removed; returns nothing.
    """
    seen = set()  # digests of the images kept so far
    count = 0     # number of duplicates removed
    # Sorted so that the file kept out of a duplicate group is deterministic.
    for file in sorted(os.listdir(dirName)):
        if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        file_path = os.path.join(dirName, file)
        if not os.path.isfile(file_path):
            continue
        try:
            # The context manager closes the file handle before any
            # os.remove below; an open handle blocks deletion on Windows.
            with Image.open(file_path) as img:
                img_array = np.array(img)
        except Exception as e:
            print("unreadable image removed:", file_path, e)
            os.remove(file_path)
            continue

        digest = hashlib.md5()
        # Include shape and dtype so that differently sized images that
        # happen to share the same raw bytes are not treated as duplicates.
        digest.update(repr((img_array.shape, img_array.dtype.str)).encode('ascii'))
        digest.update(img_array.tobytes())
        key = digest.hexdigest()

        if key in seen:
            os.remove(file_path)
            count += 1
        else:
            seen.add(key)
    print("duplicate removal:", count)
验证结果:
以上是有重复图片的样本
执行代码后:
那些相同的图片已经被删掉了,图片数量又恢复到了20张,图片去重验证成功。
不多说,代码如下:
def rename(dirName):
    """Rename the images below ``dirName`` to a uniform, numbered pattern.

    In every folder visited by ``os.walk`` the .jpg/.jpeg/.png files are
    renamed to ``hivision_buiing_<folder>_<NNNN><ext>``, where ``<folder>``
    is the name of the containing folder and ``NNNN`` is a 4-digit counter
    that restarts at 0001 in each folder. The original extension is kept.

    Args:
        dirName: Root folder; sub-folders are processed too.

    An existing file is never overwritten: if the target name is already
    taken the file is left untouched and a message is printed.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    # os.walk yields (root, dirs, files); unpacking only two names raised
    # ValueError on the very first iteration.
    for root, dirs, files in os.walk(dirName):
        # basename works with either path separator, unlike split('\\').
        name = 'hivision_buiing_' + os.path.basename(os.path.abspath(root))
        i = 0
        # Sorted so the numbering does not depend on directory listing order.
        for file_name in sorted(files):
            pic_format = os.path.splitext(file_name)[-1]
            if pic_format.lower() not in image_exts:
                continue
            i = i + 1
            oldname = os.path.join(root, file_name)
            newname = os.path.join(root, name + '_' + str(i).zfill(4) + pic_format)
            print(newname)
            if oldname == newname:
                continue  # already carries its final name
            if os.path.exists(newname):
                # os.rename silently replaces the target on POSIX.
                print('skipped {}: {} already exists'.format(oldname, newname))
                continue
            try:
                os.rename(oldname, newname)
            except OSError as f:
                print(f)
重命名验证截图:
可以看出小狗狗的名字就整齐归一了
代码如下:
def get_dirs_num(self, path=r'E:\图片'):
    """Count the images in every folder below ``path`` and save an .xls report.

    The sheet has a header row followed by one row per folder that holds at
    least one .jpg/.jpeg/.png file: folder path, folder name, file count.
    The workbook is saved as ``count.xls`` inside ``path``.

    Args:
        path: Root folder to scan. Defaults to the folder that was
            previously hard-coded, so existing callers are unaffected.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'统计文件数量', cell_overwrite_ok=True)
    row0 = ['文件夹路径', '文件夹名称', '文件数量']
    for n, title in enumerate(row0):
        sheet1.write(0, n, title)

    row = 0
    # os.walk already lists the files of each folder, so no second
    # os.listdir pass is needed.
    for root, dirs, files in os.walk(path):
        num = sum(1 for file_name in files
                  if os.path.splitext(file_name)[-1].lower() in image_exts)
        if num <= 0:
            continue
        row = row + 1
        sheet1.write(row, 0, root)
        # Write the folder's own name as a string; the original passed the
        # whole list of path components to the cell.
        sheet1.write(row, 1, os.path.basename(os.path.abspath(root)))
        sheet1.write(row, 2, num)
    f.save(os.path.join(path, 'count.xls'))
完成统计,结果如下:
完整代码统一整理如下:
import csv
import hashlib
import os
import re  # was missing: baidu_img calls re.findall
from tkinter import *  # unused here; kept because other code may rely on it

import numpy as np
import requests
from PIL import Image
import xlwt

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0)like Gecko)'}

# File extensions (lower case, with dot) that are treated as images.
_IMAGE_EXTS = ('.jpg', '.jpeg', '.png')


class image():
    """Small image-crawling toolbox: download, de-duplicate, rename, count."""

    def baidu_img(self, keword, num):
        """Download up to ``num`` Baidu image-search results for ``keword``.

        Every image is written as ``<name>.jpg`` into a folder named after
        the keyword, and one ``[name, url]`` row per saved image is appended
        to ``<folder>.csv`` next to it.

        Args:
            keword: Search keyword; also used as the output folder name.
            num: Maximum number of images to save. Values <= 0 save nothing.

        A failure on a single image is printed and the crawl continues.
        """
        timeout = 10  # seconds; without it a stalled server blocks forever
        base_url = 'https://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word={}'.format(keword)
        path1 = r"E:\图片\\" + keword
        # Creates the missing parent folder too and tolerates an existing one.
        os.makedirs(path1, exist_ok=True)

        response = requests.get(base_url, headers=headers, timeout=timeout)
        html_str = response.text
        # Extract the original image URLs with a regular expression.
        pic_url = re.findall('"objURL":"(.*?)",', html_str, re.S)

        n = 0  # number of images saved so far
        for i in pic_url:
            # Check the limit before downloading so num <= 0 fetches nothing.
            if n >= num:
                break
            try:
                reply = requests.get(i, headers=headers, timeout=timeout)
                # Do not store an HTML error page under a .jpg name.
                reply.raise_for_status()
                # The text after the last '=' may still contain characters
                # that are illegal in file names; neutralise them.
                img_name = re.sub(r'[\\/:*?"<>|\s]+', '_', i.split('=')[-1])
                if not img_name:
                    img_name = 'image_{}'.format(n + 1)
                with open(os.path.join(path1, img_name + '.jpg'), 'wb') as f:
                    f.write(reply.content)
                n = n + 1
                with open(path1 + '.csv', 'a', newline='') as ff:
                    csvwriter = csv.writer(ff, dialect='excel')
                    csvwriter.writerow([img_name, i])
            except Exception as e:
                # Best effort per image: report and move on to the next URL.
                print(i, e)

    def md5(self, dirName):
        """Delete duplicate images in ``dirName`` by hashing decoded pixels.

        The first file seen for a digest is kept, later ones are removed.
        Files that cannot be decoded are removed as well (and reported).
        Prints the number of duplicates removed.

        Args:
            dirName: Folder whose direct children are examined.
        """
        seen = set()  # digests of the images kept so far
        count = 0     # number of duplicates removed
        # Sorted so the file kept out of a duplicate group is deterministic.
        for file in sorted(os.listdir(dirName)):
            if not file.lower().endswith(_IMAGE_EXTS):
                continue
            file_path = os.path.join(dirName, file)
            if not os.path.isfile(file_path):
                continue
            try:
                # Close the handle before any os.remove; an open handle
                # blocks deletion on Windows.
                with Image.open(file_path) as img:
                    img_array = np.array(img)
            except Exception as e:
                print("unreadable image removed:", file_path, e)
                os.remove(file_path)
                continue

            digest = hashlib.md5()
            # Shape and dtype are hashed too, so images of different size
            # with identical raw bytes are not mistaken for duplicates.
            digest.update(repr((img_array.shape, img_array.dtype.str)).encode('ascii'))
            digest.update(img_array.tobytes())
            key = digest.hexdigest()

            if key in seen:
                os.remove(file_path)
                count += 1
            else:
                seen.add(key)
        print("duplicate removal:", count)

    def rename(self, dirName):
        """Rename images below ``dirName`` to ``hivision_buiing_<folder>_<NNNN><ext>``.

        ``<folder>`` is the name of the containing folder and ``NNNN`` a
        4-digit counter that restarts at 0001 in each folder. An existing
        file is never overwritten; such a rename is skipped and reported.

        Args:
            dirName: Root folder; sub-folders are processed too.
        """
        for root, dirs, files in os.walk(dirName):
            # basename works with either path separator, unlike split('\\').
            name = 'hivision_buiing_' + os.path.basename(os.path.abspath(root))
            i = 0
            # Sorted so numbering does not depend on directory listing order.
            for file_name in sorted(files):
                pic_format = os.path.splitext(file_name)[-1]
                if pic_format.lower() not in _IMAGE_EXTS:
                    continue
                i = i + 1
                oldname = os.path.join(root, file_name)
                newname = os.path.join(root, name + '_' + str(i).zfill(4) + pic_format)
                print(newname)
                if oldname == newname:
                    continue  # already carries its final name
                if os.path.exists(newname):
                    # os.rename silently replaces the target on POSIX.
                    print('skipped {}: {} already exists'.format(oldname, newname))
                    continue
                try:
                    os.rename(oldname, newname)
                except OSError as f:
                    print(f)

    def get_dirs_num(self, path=r'E:\图片'):
        """Count the images in every folder below ``path``; save an .xls report.

        One row per folder holding at least one image: folder path, folder
        name, file count. The workbook is saved as ``count.xls`` in ``path``.

        Args:
            path: Root folder to scan; defaults to the previously
                hard-coded folder.
        """
        f = xlwt.Workbook()
        sheet1 = f.add_sheet(u'统计文件数量', cell_overwrite_ok=True)
        row0 = ['文件夹路径', '文件夹名称', '文件数量']
        for n, title in enumerate(row0):
            sheet1.write(0, n, title)

        row = 0
        for root, dirs, files in os.walk(path):
            num = sum(1 for file_name in files
                      if os.path.splitext(file_name)[-1].lower() in _IMAGE_EXTS)
            if num <= 0:
                continue
            row = row + 1
            sheet1.write(row, 0, root)
            # Write the folder's own name as a string; the original passed
            # the whole list of path components to the cell.
            sheet1.write(row, 1, os.path.basename(os.path.abspath(root)))
            sheet1.write(row, 2, num)
        f.save(os.path.join(path, 'count.xls'))


if __name__ == '__main__':
    keyword = '狗'
    dirName = r"E:\图片\\{}".format(keyword)
    num = 20
    test = image()
    test.baidu_img(keyword, num)
    test.md5(dirName)
    test.rename(dirName)
    test.get_dirs_num()