R语言实现数据预处理

### 数据去重 ###
# 对向量进行操作
set.seed(1234)
v <- sample(1:10,15,replace = T)
v
unique(v)

# 对矩阵进行操作
set.seed(1234)
m <- matrix(sample(1:3,12,replace = T),ncol=2)
m
unique(m)

# 对数据框进行操作
set.seed(1234)
df <- data.frame(x1 = sample(1:3,8,replace = T),
                 x2 = sample(letters[1:3],8,replace = T))
df
(a <- unique(df))
if(!require(dplyr)) install.packages("dplyr") #加载dplyr包，如不存在就进行在线安装
(b <- distinct(df))
# 针对某一列进行操作
(c <- unique(df$x1))
class(c)
(d <- distinct(df,x1))
class(d)

### 数据排序 ###
set.seed(1234)
v <- sample(1:10,10)
v
sort(v)
order(v)
which.min(v)
v[order(v)]

set.seed(1234)
df <- data.frame(x = sample(1:10,10,replace = T),
                 y = sample(1:10,10,replace = T),
                 z = sample(1:10,10,replace = T))
df
df[order(df$x,df$y),]
df[order(df$y,df$x,-df$z),]

library(dplyr)
# 先按照x升序排序，x变量值相同时再按照y变量升序排序
arrange(df,x,y) 
# 先按照y变量升序、再按照x变量升序，最后按照z变量降序
arrange(df,y,x,desc(z))  

### 数据筛选 ###
iris[1:6,c(1,3,5)]
iris[iris$Sepal.Length>5.5 & iris$Species=='setosa',]

# subset函数
subset(mtcars,cyl==4,select = mpg:hp)
subset(mtcars,cyl==4,select = 1:4)
subset(mtcars,cyl==4 & am==1,select=c('mpg','cyl','am'))
subset(mtcars,subset = substr(rownames(mtcars),1,4) %in% c('Merc','Fiat'))

# filter函数
# method 1
filter(mtcars,cyl==4)
# method 2
mtcars %>% filter(cyl==4) 
mtcars %>% filter(cyl==4) %>% select(mpg:hp)

### 数据合并 ###
r1 <- matrix(1:6,nrow = 2) # 创建两行三列的矩阵
r2 <- matrix(7:12,nrow = 2) # 创建两行三列的矩阵
r1;r2
rbind(r1,r2) # 行合并
cbind(r1,r2) # 列合并
r3 <- matrix(1:16,nrow = 4) # 创建四行四列的矩阵
r3
rbind(r1,r3)
cbind(r1,r3)

library(dplyr)
r1 <- as.data.frame(r1)
r2 <- as.data.frame(r2)
r4 <- data.frame(V1 = c(1,2),
                 x2 = c(4,5),
                 x3 = c(7,8))
r1;r2;r4

rbind(r1,r2)
bind_rows(r1,r2) 

rbind(r1,r4)
bind_rows(r1,r4)


bind_cols(r1,r2) 
cbind(r1,r2)
bind_cols(r1,r4)
cbind(r1,r4)

### 数据关联 ###
set.seed(1234)
(student <- data.frame(name = c('Emily','Jacob','Emma','Michael','Olivia','Isabella','Daniel'),
                      gender = c('female','male','female','male','female','female','male')))
(english_score <- data.frame(name = c(as.character(sample(student$name,5)),'Tracy'),
                            score = sample(50:100,6)))

merge(student,english_score,by = 'name')           #inner模式
merge(student,english_score,by = 'name',all.x = T) # left模式
merge(student,english_score,by = 'name',all.y = T) # right模式
merge(student,english_score,by = 'name',all = T)   # out模式


(math_score <- data.frame(stu_name = sample(student$name,5),
                          score = sample(60:100,5)))
# 第一步，对student和english_socre表进行关联
df_join <- merge(student,english_score,all = T)
# 第二步，对df_join和math_score表进行关联
df_join1 <- merge(df_join,math_score,by.x = 'name',by.y = 'stu_name',all = T)
df_join1
df_join2 <- merge(df_join,math_score,
                  by.x = 'name',by.y = 'stu_name',all = T,
                  suffixes = c('.english','.math')) #修改后缀名称
df_join2

student %>%
  full_join(english_score,by = 'name') %>%
  full_join(math_score,by = c('name' = "stu_name"),suffix = c('.english','.math'))


student %>%
  merge(english_score,by = 'name',all = T) %>%
  merge(math_score,by.x = 'name',by.y = 'stu_name',
        all = T,suffixes = c('.english','.math'))

### 数据转换 ###
mtcars$vs  #查看vs变量的值
mtcars$vs <- ifelse(mtcars$vs==0,"V-shaped","straight")
mtcars$vs 
mtcars <- transform(mtcars,vs = ifelse(vs=="V-shaped",0,1)) 
mtcars$vs

ncol(mtcars) # 查看列数
mtcars <- transform(mtcars,
                    Engine = ifelse(vs==0,"V","S"),
                    Transmission = ifelse(am==0,"a","m"))
ncol(mtcars)
head(mtcars)

# 移除变量
library(dplyr)
mtcars <- mtcars %>% mutate(
  Engine = NULL,
  Transmission = NULL
)
colnames(mtcars)

head(mtcars %>% mutate(disp_1 = disp / 61.0237))
head(mtcars %>% transmute(disp_1 = disp / 61.0237))

### 融合重铸 ###
set.seed(1234)
df <- data.frame(name = c('Emily','Jacob','Emma'),
                 gender = c('female','male','female'),
                 chinese = sample(50:100,3),
                 english = sample(60:100,3),
                 mathematics = sample(70:100,3))
df

# 将宽数据转换为长数据
# 使用reshape包
df_melt <- reshape::melt(df,id.vars = c('name','gender'),
                         variable_name = 'class')
# 使用reshape2包
df_melt1 <- reshape2::melt(df,id.vars = c('name','gender'),
                           variable.name = 'class',
                           value.name = 'score')
# 使用tidyr包
df_gather <- tidyr::gather(df,key = 'class',
                           value = 'score',-c('name','gender'))
df_gather

# 将长数据转换为宽数据
# 使用reshape包
df_cast <- reshape::cast(df_melt,name+gender~class,value = 'value')
# 使用reshape2包
df_dcast <- reshape2::dcast(df_melt1,name+gender~class,
                            value.var = 'score')
# 使用tidyr包
df_spread <- tidyr::spread(df_gather,'class','score')
df_spread

# 按照性别进行学科成绩求平均
# 使用reshape包
reshape::cast(df_melt,gender ~ class,
              fun.aggregate = mean,value = 'value')
# 使用reshape2包
reshape2::dcast(df_melt1,gender ~ class,
                fun.aggregate = mean,value.var = 'score')


### 数据聚合 ###
set.seed(1234)
(x <- sample(1:10,10))
# 自定义描述统计函数
stat.desc <-function(x){
  list('求和' = sum(x,na.rm = T),
       '累计求和' = cumsum(x),
       '最大值' = max(x,na.rm = T),
       '最小值' =  min(x,na.rm = T),
       '平均值' = mean(x,na.rm = T),
       '中位数' = median(x,na.rm = T),
       '百分位数' = quantile(x,na.rm = T),
       '极差' = range(x)[2] - range(x)[1],
       '四分位距' = IQR(x,na.rm = T),
       '方差' = var(x,na.rm = T),
       '标准差' = sd(x,na.rm = T),
       '变异系数' = sd(x,na.rm = T)/mean(x,na.rm = T))
}
stat.desc(x)

library(dplyr)
mtcars %>%
  summarise(mean = mean(mpg),median = median(mpg),sd = sd(mpg))

mtcars %>%
  summarise_at(vars(mpg,disp),
               list(mean = mean,median = median,sd = sd))

mtcars %>%
  summarise_all(list(mean = mean,median = median))

# 描述统计分析
summary(iris)


### 数据分组 ###
aggregate(mtcars[,c('mpg','disp')],by = list(am = mtcars$am,vs = mtcars$vs),
          FUN = mean)

aggregate(mtcars[,c('mpg','disp')],by = list(am = mtcars$am,vs = mtcars$vs),
          FUN = list(mean,sd))

myfun <- function(x){
  c(mean = mean(x,na.rm = T),
       sd = sd(x,na.rm = T))
}
aggregate(mtcars[,c('mpg','disp')],by = list(am = mtcars$am,vs = mtcars$vs),
          FUN = myfun)

library(dplyr)
mtcars %>%
  group_by(am,vs) %>%
  summarise_at(vars(mpg,disp),list(mean = mean,sd = sd))

### 5.1 数据抽样 ###
# SMOTE抽样
set.seed(1234)
df <- data.frame(x1 = sample(1:100,100),
                 x2 = sample(1:100,100),
                 type = sample(c(rep('a',95),rep('b',5)),100))
head(df) # 查看数据前六行
table(df$type) # 查看类别频数
prop.table(table(df$type)) # 查看类别占比

# DMwR已经不再CRAN官网上，需要通过以下地址https://cran.r-project.org/src/contrib/Archive/DMwR/下载后进行本地安装
library("DMwR") # 加载DMwR包
df$type <- as.factor(df$type)
df_new <- SMOTE(type~.,df,perc.over = 100,perc.under = 200) 
table(df_new$type) 
prop.table(table(df_new$type))

# 随机抽样
set.seed(1234)
(x <- seq(1,10))
(a <- sample(x,8,replace=FALSE)) # 无放回抽样
(b <- sample(x,8,replace=TRUE))  # 有放回抽样
# 当size大于x的长度
sample(x,15,replace = FALSE)
sample(x,15,replace = TRUE)


table(iris$Species) # 查看各类别频数
d <- 1:nrow(iris)   # 提取下标集
index1 <- sample(d[iris$Species=='setosa'],50*0.5)
index2 <- sample(d[iris$Species=='versicolor'],50*0.5)
index3 <- sample(d[iris$Species=='virginica'],50*0.5)
iris_sub <- iris[c(index1,index2,index3),]
table(iris_sub$Species)

# 等比例抽样
# 载入caret包，如果本地未安装就进行在线安装caret包
if(!require(caret)) install.packages("caret")
# 提取下标集
splitindex <- createDataPartition(iris$Species,times=1,p=0.1,list=FALSE)
splitindex
# 提取符合子集
sample <- iris[splitindex,]
# 查看Species变量中各类别的个数和占比
table(sample$Species);
prop.table(table(sample$Species))
# 设置list为TRUE
# 提取下标集
splitindex1 <- createDataPartition(iris$Species,times=1,p=0.1,list=TRUE)
# 查看下标集
splitindex1
# 提取子集
iris[splitindex1$Resample1,]
# 设置times=2
splitindex2 <- createDataPartition(iris$Species,times=2,p=0.1,list=TRUE)
splitindex2

# 利用sample函数构建五折交叉验证的训练集和测试集
# zz1为所有观测值的下标
n <- nrow(iris);zz1 <- 1:n 
# zz2为1:5的随机排列
set.seed(1234)
zz2 <- rep(1:5,ceiling(n/5))[1:n]
zz2 <- sample(zz2,n)
# 构建训练集及测试集
for(i in 1:5){
  m <- zz1[zz2==i]
  train <- iris[-m,]
  test <- iris[m,]
  # 接下来就可以利用训练集建立模型，测试集验证模型，并计算5次MSE
}

# 利用createFolds函数构建五折交叉验证的训练集和测试集
index <- createFolds(iris$Species,k=5,list=FALSE)
prop.table(table(iris[index==1,'Species']))
prop.table(table(iris[index==2,'Species']))
prop.table(table(iris[index==3,'Species']))
prop.table(table(iris[index==4,'Species']))
prop.table(table(iris[index==5,'Species']))

### 5.2 数据清洗 ###
# 缺失值判断及处理
player <- read.csv("../data/玩家玩牌数据.csv")
head(player)

# 使用is.na()函数对元素是否为缺失值进行判断
head(is.na(player))
# 统计玩牌局数变量的缺失值与非缺失值的个数
table(is.na(player$玩牌局数))
# 计算缺失值个数
sum(is.na(player$玩牌局数))
# 计算缺失值占比
mean(is.na(player$玩牌局数))

# 利用complete.cases函数查看完整实例
sum(complete.cases(player))

# 用md.pattern函数查看player的缺失值模式
if(!require(mice)) install.packages("mice")
md.pattern(player)

# 用aggr函数对player数据的缺失值模式进行可视化
if(!require(VIM)) install.packages("VIM")
aggr(player,prop=FALSE,numbers=TRUE)

# 删除缺失样本
player_complete <- na.omit(player)
# 计算有缺失值的样本个数
sum(!complete.cases(player_complete))
# 判断每一列的缺失值占比是否小于0.5
(opt <- apply(player,2,function(x){mean(is.na(x))})<0.5) 
# 剔除缺失值占比超过50%的列
player_opt <- player[,opt]
head(player_opt)

# apply函数
(M <- matrix(1:8,nrow = 2))
apply(M,1,sum) #对每一行进行求和
apply(M,2,sum)  #对每一列进行求和

# 替换缺失值
iris1 <- iris[,c(1,5)]
# 将40、80、120号样本的Sepal.Length变量值设置为缺失值
iris1[c(40,80,120),1] <- NA
# 利用均值替换缺失值
iris1[c(40,80,120),1] <- round(mean(iris1$Sepal.Length,na.rm = T),1)
# 查看以前的值和现在的值
iris[c(40,80,120),1];iris1[c(40,80,120),1]
# 绘制箱线图
plot(iris$Sepal.Length~iris$Species,col=heat.colors(3))

# 利用同类均值进行赋值的方式来填补缺失值
# 将40、80、120号样本的Sepal.Length设置为缺失值
iris1[c(40,80,120),1] <- NA
iris1[40,1] <- round(mean(iris1[iris1$Species=='setosa','Sepal.Length'],
                          na.rm = T),1)
iris1[80,1] <- round(mean(iris1[iris1$Species=='versicolor','Sepal.Length'],
                          na.rm = T),1)
iris1[120,1] <- round(mean(iris1[iris1$Species=='virginica','Sepal.Length'],
                           na.rm = T),1)
# 查看以前的值和现在的值
iris[c(40,80,120),1];iris1[c(40,80,120),1]

# 利用missForest进行缺失值赋值
iris1 <- iris
iris1[c(40,80,120),1] <- NA
if(!require(missForest)) install.packages("missForest")
mf_model <- missForest(iris1)
iris1_impute <- mf_model$ximp
iris1_impute[c(40,80,120),1]
iris[c(40,80,120),1]

### 5.2.2 异常值判断处理 ###
# 3σ原则
# 绘制质量控制图
set.seed(1234)
data <- rnorm(20)
plot(data,type = "o",pch=16,lwd=1.5,axes=FALSE,xlab=NA,ylab=NA,
     ylim = c(-4,4),xlim = c(0,23),main="质量控制图")
lines(rep(mean(data),20),lwd=1.8);text(21,mean(data),"均值线")
lines(rep(mean(data)-3*sd(data),20),lty=2,col="red",lwd=1.8)
text(21,mean(data)-3*sd(data),labels = "控制下限",col="red")
lines(rep(mean(data)+3*sd(data),20),lty=2,col="red",lwd=1.8)
text(21,mean(data)+3*sd(data),labels = "控制上限",col="red")

# P质量控制图
payrate <- read.csv("../data/用户付费数据.csv",T)
head(payrate)
# 绘制付费率的单值-均值质量控制图
library(qcc)
attach(payrate)
qcc(付费率,type="xbar.one",labels= 日期,
         title="用户付费率的单值-均值质量监控图",
         xlab="date",ylab="用户付费率")

# 箱线图分析
# 通过boxplot.stat()函数识别异常值
attach(payrate)
boxplot.stats(付费率)
# 查找异常值的下标
idx <- which(付费率 %in% boxplot.stats(付费率)$out)
# 查看异常值的下标集
idx
# 绘制箱线图
boxplot(付费率,col='violet')
# 通过text函数把异常值的日期和数值在图上显示
text(1.1,boxplot.stats(付费率)$out,
     labels=paste(payrate[idx,'日期'],payrate[idx,'付费率']),
     col="darkgreen")

# 通过聚类进行异常检测
set.seed(1234)
M <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(M) <- c("x", "y")
cl <- kmeans(M, 2) # 将样本分为两群
centers <- cl$centers[cl$cluster,] # 给出每个样本所属类中心值
distances <- sqrt(rowSums((M-centers)^2)) # 计算每个样本与所属类中心的距离
outliers <- order(distances,decreasing = T)[1:5] # 对距离进行降息排序
print(outliers)
print(M[outliers,])

# 对结果进行可视化
plot(M, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex = 2)
points(M[outliers,],pch="+",col=4,cex=1.5)

### 5.3 数据变换 ###
# cut函数 
set.seed(1234)
age <- c(0,sample(0:99,99,replace = T))
breaks <- c(0,6,12,17,45,69,Inf) # 划分区间
age_cut <- cut(age,breaks = breaks) # 数据分箱
table(age_cut) # 查看分箱结果
age_cut1 <- cut(age,breaks = breaks,include.lowest = TRUE) # 设置为包含0
table(age_cut1)
label <- c('婴幼儿','少儿','青少年','青年','中年','老年')
age_cut2 <- cut(age,breaks = breaks,labels = label,
                include.lowest = TRUE) 
table(age_cut2)

# 数据标准化
#采用(x-mu)/std的标准化方法，与scale()函数效果一样
library(caret)
standard <- preProcess(iris)  
head(predict(standard,iris))
head(scale(iris[,1:4]))
#采用(x-min(x))/(max(x)-min(x))的标准化方法
standard <- preProcess(iris, method = 'range')  
head(predict(standard,iris))
fun <- function(x) (x-min(x))/(max(x)-min(x))
head(sapply(iris[,1:4],fun))

# 哑变量处理
# 构建customers数据集
customers<-data.frame(id=c(10,20,30,40,50),
                   gender=c("male","female","female","male","female"),
                   mood=c("happy","sad","happy","sad","happy"),
                   outcome=c(1,1,0,0,0))
customers
# 对因子型变量进行哑变量处理
# 创建新数据框customers.new
customers.new <- customers[,c('id','outcome')]
# 对gender变量进行哑变量处理
customers.new$gender.male <- ifelse(customers$gender=='male',1,0)
customers.new$gender.female <- ifelse(customers$gender=='female',1,0)
# 对mood变量进行哑变量处理
customers.new$mood.happy <- ifelse(customers$mood=='happy',1,0)
customers.new$mood.sad <- ifelse(customers$mood=='sad',1,0)
customers.new

# 加载caret包到内存
library(caret)
# 查看customers的数据结构
str(customers)
customers$gender <- as.factor(customers$gender)
customers$mood <- as.factor(customers$mood)
# 利用dummyVars函数对customers数据进行哑变量处理
dmy<-dummyVars(~.,data=customers)
# 对自身变量进行预测，并转换成data.frame格式
trsf<-data.frame(predict(dmy,newdata=customers))
# 查看转换结果
trsf
# 将outcome变量转换成因子型变量
customers$outcome <- as.factor(customers$outcome)
# 利用dummyVars函数对customers数据进行哑变量处理
dmy<-dummyVars(~.,data=customers)
# 对自身变量进行预测，并转换成data.frame格式
trsf<-data.frame(predict(dmy,newdata=customers))
# 查看转换结果
trsf
# 只对gender变量进行哑变量转换
dmy.gender <- dummyVars(~gender,data=customers,levelsOnly=TRUE)
trsf.gender <- data.frame(predict(dmy.gender,newdata=customers))
trsf.gender
# 将levelsOnly和fullRank设置为TRUE
dmy<-dummyVars(~.,data=customers,levelsOnly=TRUE,fullRank=TRUE)
trsf<-data.frame(predict(dmy,newdata=customers))
trsf
R语言实现数据预处理

悦读