使用python写一个数据清洗器

前言

对于数据科学工作者来说，整理数据、为样本分类是必不可少的工作。然而，用传统的复制、粘贴来移动图片，或者一张一张打开图片检查，效率太低。所以本人使用python（UI基于tkinter，文件IO基于os和shutil，图像处理基于PIL和numpy）写了一个数据清洗器，可以有效提高数据整理工作的效率。

效果展示

打开脚本时，会弹出文件夹选择框，此时需要选择样本集所在的文件夹
（图：文件夹选择对话框）

正式的工作界面如下图所示，具有的功能有：
（图：数据清洗器的工作界面）

①可以重新导入其他样本
②可以显示图片，同时图片上方有当前图片索引和样本总数的提示
③可以将该样本移动到正样本、负样本或者直接删除
④使用鼠标滚轮切换图片，向下滚动则切换到下一张，反之则切换到上一张。
⑤可以通过改变索引值，浏览任意索引的图片。（注：下文给出的源代码中，索引仅以标签形式显示，尚未包含直接修改索引值的实现。）

通过以上功能可以快速地对你的样本进行分类，同时，当你获得的数据质量较差的时候，比如有些正样本被标成了负样本，有些负样本被标成了正样本，那么可以使用这个脚本快速地对样本进行再分类，从而实现清洗的作用。

源代码

import shutil
import os
from tkinter import *
from tkinter import filedialog
from PIL import Image, ImageTk
import numpy as np
import time

class Cleaner(Tk):
    """Tk window for reviewing a folder of sample images one at a time.

    The window shows the current image together with an ``index/total``
    caption.  Three buttons move the current file into the positive-sample
    directory, move it into the negative-sample directory, or delete it.
    Scrolling the mouse wheel over the image steps to the next/previous file,
    and a fourth button lets the user pick another folder.

    Attributes:
        pos_dir: Destination directory for "positive" samples.
        neg_dir: Destination directory for "negative" samples.
        imgs: File names (not full paths) of the remaining samples inside
            ``work_dir``.
        img_idx: Index into ``imgs`` of the sample currently displayed.
        work_dir: Folder chosen by the user, or ``None`` before any choice.
        Height, Width: Fixed window size in pixels.
    """

    # (width, height) every image is scaled to before being displayed.
    DISPLAY_SIZE = (400, 320)
    # Picture shown once no samples are left in the working folder.
    FINISH_IMG = r'D:\核聚变课题组\处理数据的脚本\数据清洗器\finish.png'
    # Failures that mean "this file cannot be rendered", as opposed to bugs.
    _LOAD_ERRORS = (OSError, ValueError, TypeError)

    def __init__(self, Height=500, Width=650):
        """Build the fixed-size window and all of its widgets.

        Args:
            Height: Window height in pixels.
            Width: Window width in pixels.
        """
        super().__init__()
        # Destination directories for positive and negative samples.
        self.pos_dir = 'D:\\核聚变课题组\\样本图片\\正样本'
        self.neg_dir = 'D:\\核聚变课题组\\样本图片\\负样本'
        # Remaining sample names, cursor into that list, and working folder.
        self.imgs = []
        self.img_idx = 0     # start reviewing from the first image
        self.work_dir = None
        # Tk only keeps a weak link to the image shown by a Label, so the
        # PhotoImage must be referenced from Python or it is garbage-collected
        # and the label goes blank.
        self._photo = None
        # Window size.
        self.Height = Height
        self.Width = Width
        # Window title.
        self.title('数据清洗器')
        # Apply the size and forbid resizing by the user.
        self.geometry('{}x{}'.format(self.Width, self.Height))
        self.resizable(0, 0)
        # Create the action buttons and lay them out.
        self.go_pos = Button(self, text="转移到正样本", command=self.go2pos)
        self.go_neg = Button(self, text="转移到负样本", command=self.go2neg)
        self.go_bin = Button(self, text="删除该样本", command=self.go2bin)
        self.go_dir = Button(self, text="导入样本", command=self.get_imgs)
        button_row_y = self.Height // 5 * 4 + 10
        self.go_pos.place(x=self.Width // 5 + 10, y=button_row_y)
        self.go_neg.place(x=self.Width // 5 + 150, y=button_row_y)
        self.go_bin.place(x=self.Width // 5 + 300, y=button_row_y)
        self.go_dir.place(x=10, y=10)
        # Image label (also receives the mouse-wheel events) and caption label.
        self.img_box = Label(self)
        self.img_box.place(x=120, y=60)
        self.img_box.bind("<MouseWheel>", self.switch_img)
        self.idx_box = Label(self)
        # NOTE(review): the x offset is derived from Height, not Width; kept
        # as-is so the existing layout does not shift.
        self.idx_box.place(x=self.Height // 2 + 60, y=30)

    # ------------------------------------------------------------------
    # Button callbacks
    # ------------------------------------------------------------------
    def go2pos(self):
        """Move the current sample into ``pos_dir`` and show the next one."""
        self._dispose(lambda path: self._move_into_dir(path, self.pos_dir))

    def go2neg(self):
        """Move the current sample into ``neg_dir`` and show the next one."""
        self._dispose(lambda path: self._move_into_dir(path, self.neg_dir))

    def go2bin(self):
        """Delete the current sample from disk and show the next one."""
        self._dispose(os.remove)

    # ------------------------------------------------------------------
    # Image handling
    # ------------------------------------------------------------------
    def cut_img(self, img_array):
        """Crop arrays of known heights to a fixed sub-rectangle.

        Arrays with 1584 or 1581 rows are cut to rows 196:1420 and columns
        371:2612; arrays with 2134 or 2145 rows are cut to rows 240:1905 and
        columns 500:3524.  Any other array is returned unchanged.

        Args:
            img_array: Image pixels as a numpy array with rows on axis 0.

        Returns:
            The cropped view, or ``img_array`` itself when no rule applies.
        """
        rows = img_array.shape[0]
        if rows in (1584, 1581):
            return img_array[196:1420, 371:2612]
        if rows in (2134, 2145):
            return img_array[240:1905, 500:3524]
        return img_array

    def get_imgs(self):
        """Ask the user for a sample folder and display its first image.

        Cancelling the dialog leaves the current state untouched.  An empty
        folder shows the "finished" picture with a ``0/0`` caption.
        """
        chosen = filedialog.askdirectory()
        if not chosen:          # dialog was cancelled
            return
        try:
            names = self._list_samples(chosen)
        except OSError as err:
            self._report_error(err)
            return
        self.work_dir = chosen
        self.imgs = names
        self.img_idx = 0
        self._show_current()

    def show_img(self, img_dir):
        """Load, crop, resize and display one image.

        Args:
            img_dir: File name relative to ``work_dir`` or an absolute path.

        A file that cannot be rendered clears the image area and is named in
        the caption, so the user can still scroll past it or delete it.
        """
        path = self._path_of(img_dir)
        caption = '{}/{}'.format(self.img_idx + 1, len(self.imgs))
        try:
            photo = self._load_photo(path)
        except self._LOAD_ERRORS:
            photo = None
            caption += ' (无法显示: {})'.format(os.path.basename(path))
        self._display(photo, caption)

    def switch_img(self, event):
        """Step through the samples in response to a mouse-wheel event.

        A negative ``event.delta`` (scrolling down) advances to the next
        sample; a positive one goes back.  The index is clamped to the list.
        """
        if not self.imgs:
            return
        previous = self.img_idx
        if event.delta < 0 and self.img_idx < len(self.imgs) - 1:
            self.img_idx += 1
        elif event.delta > 0 and self.img_idx > 0:
            self.img_idx -= 1
        if self.img_idx != previous:
            self._show_current()

    def run(self):
        """Prompt for the sample folder, then enter the Tk event loop."""
        self.get_imgs()
        # The event loop is entered exactly once; callbacks only update the
        # widgets and return.
        self.mainloop()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _list_samples(directory):
        """Return the sorted names of the regular files inside ``directory``.

        Sub-directories are skipped so the action buttons never move or try
        to delete a whole folder; sorting keeps the review order stable.
        """
        return sorted(
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        )

    @staticmethod
    def _move_into_dir(src, dst_dir):
        """Move file ``src`` into directory ``dst_dir``, creating it if needed.

        Without the ``makedirs`` call, ``shutil.move`` would rename ``src`` to
        a *file* called ``dst_dir`` whenever that directory does not exist.
        """
        os.makedirs(dst_dir, exist_ok=True)
        shutil.move(src, dst_dir)

    def _path_of(self, name):
        """Resolve a sample name against ``work_dir`` (if one is set)."""
        if self.work_dir:
            return os.path.join(self.work_dir, name)
        return name

    def _load_photo(self, path):
        """Read ``path`` and return it as a cropped, resized ``PhotoImage``."""
        # Close the file before returning so a later move/delete of the same
        # file is not blocked by an open handle.
        with Image.open(path) as raw:
            pixels = np.array(raw)
        cropped = Image.fromarray(self.cut_img(pixels))
        return ImageTk.PhotoImage(cropped.resize(self.DISPLAY_SIZE))

    def _display(self, photo, caption):
        """Put ``photo`` (or nothing, when ``None``) and ``caption`` on screen."""
        self._photo = photo     # keep the image alive while it is displayed
        self.img_box.config(image=photo if photo is not None else '')
        self.idx_box.config(text=caption)

    def _show_current(self):
        """Display the sample at ``img_idx``, or the finished screen if none."""
        if self.imgs:
            self.show_img(self.imgs[self.img_idx])
        else:
            self._show_finished()

    def _show_finished(self):
        """Show the "all done" picture with a ``0/0`` caption."""
        try:
            photo = self._load_photo(self.FINISH_IMG)
        except self._LOAD_ERRORS:
            # The picture is optional; fall back to an empty image area.
            photo = None
        self._display(photo, '0/0')

    def _drop_current(self):
        """Remove the current entry from ``imgs`` and clamp ``img_idx``."""
        del self.imgs[self.img_idx]
        if self.img_idx >= len(self.imgs):
            self.img_idx = max(len(self.imgs) - 1, 0)

    def _dispose(self, action):
        """Apply ``action(path)`` to the current sample, then advance.

        Args:
            action: Callable taking the sample's path; expected to move or
                delete the file and to raise ``OSError`` on failure.

        On failure the error is shown to the user and the sample stays in
        the list, so nothing is silently lost.
        """
        if not self.imgs:
            return
        path = self._path_of(self.imgs[self.img_idx])
        try:
            action(path)
        except OSError as err:      # shutil.Error is a subclass of OSError
            self._report_error(err)
            return
        self._drop_current()
        self._show_current()

    def _report_error(self, err):
        """Show ``err`` to the user in a modal error dialog."""
        # Imported here because ``from tkinter import *`` does not expose the
        # ``messagebox`` submodule.
        from tkinter import messagebox
        messagebox.showerror('操作失败', str(err))


if __name__ == '__main__':
    # Script entry point: build the window, prompt for a sample folder and
    # hand control to the Tk event loop.
    app = Cleaner()
    app.run()

教程

这个脚本的编程细节有点多，整理起来可能要上万字，本人会慢慢更新。

使用python写一个数据清洗器

前言

效果展示

源代码

教程

悦读