Pandas映射&分组聚合
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
import pandas as pd
from pandas import DataFrame, Series
df = DataFrame( data= np. random. randint( 0 , 100 , size= ( 5 , 6 ) ) )
df
0 1 2 3 4 5 0 43 17 57 61 45 5 1 15 63 83 37 85 37 2 23 84 51 75 34 28 3 71 13 31 55 3 96 4 39 20 13 35 33 3
# Scalar form: every cell equal to 5, in any column, is shown as 'Five'.
# replace() returns a new frame; df itself is left unchanged.
df.replace(5, 'Five')
0 1 2 3 4 5 0 43 17 57 61 45 Five 1 15 63 83 37 85 37 2 23 84 51 75 34 28 3 71 13 31 55 3 96 4 39 20 13 35 33 3
# Dict form without `value`: each key is an old cell value and each dict value
# is its replacement, so this again turns every 5 into 'Five'.
df.replace({5: 'Five'})
0 1 2 3 4 5 0 43 17 57 61 45 Five 1 15 63 83 37 85 37 2 23 84 51 75 34 28 3 71 13 31 55 3 96 4 39 20 13 35 33 3
# Dict form combined with `value`: the dict now reads {column label: old value}.
# Only a 3 found in column 4 becomes 'Three'; a 3 in any other column is kept.
df.replace({4: 3}, 'Three')
0 1 2 3 4 5 0 43 17 57 61 45 5 1 15 63 83 37 85 37 2 23 84 51 75 34 28 3 71 13 31 55 Three 96 4 39 20 13 35 33 3
"""
概念:创建一个映射关系列表,把values元素和一个特定的标签或者字符串绑定(给一个元素值提供不同的表现形式)
创建一个df 两列分别为姓名和薪资,然后给其名字起对应的英文名
"""
dic = {
'name' : [ '张三' , '李四' , '张三' ] ,
'salary' : [ 15000 , 20000 , 15000 ]
}
df = DataFrame( data= dic)
df
name salary 0 张三 15000 1 李四 20000 2 张三 15000
dic = {
'张三' : 'tom' ,
'李四' : 'jack'
}
df[ 'e_name' ] = df[ 'name' ] . map ( dic)
df
name salary e_name 0 张三 15000 tom 1 李四 20000 jack 2 张三 15000 tom
def after_sal(s, threshold=3000, rate=0.5):
    """Return the salary left after a deduction on the part above ``threshold``.

    Only the portion of ``s`` that exceeds ``threshold`` is reduced by
    ``rate``; a salary at or below the threshold is returned undiminished.

    Args:
        s: Gross salary. Either a scalar (what ``Series.map`` passes in) or
            an array-like such as a pandas Series.
        threshold: Amount that is exempt from the deduction.
        rate: Fraction of the excess over ``threshold`` that is deducted.

    Returns:
        The remaining salary, as a float for scalar input or an
        element-wise result for array-like input.
    """
    # Clamp the excess at zero: without this, a salary below the threshold
    # yields a negative excess and the result would be *larger* than ``s``.
    excess = np.maximum(s - threshold, 0)
    return s - excess * rate
df[ 'after_sal' ] = df[ 'salary' ] . map ( after_sal)
df
name salary e_name after_sal 0 张三 15000 tom 9000.0 1 李四 20000 jack 11500.0 2 张三 15000 tom 9000.0
随机抽样:使用 take() 和 np.random.permutation() 打乱行列顺序并抽取数据
# 100 rows x 3 columns of random integers in [0, 100), labelled A/B/C.
# (The original repeated the assignment target: `df = df = ...`.)
df = DataFrame(
    data=np.random.randint(0, 100, size=(100, 3)),
    columns=['A', 'B', 'C'],
)
df
A B C 0 50 63 81 1 23 90 67 2 1 72 47 3 50 42 63 4 47 87 69 ... ... ... ... 95 77 98 64 96 91 90 87 97 31 81 43 98 62 71 85 99 68 11 55
100 rows × 3 columns
np. random. permutation( 10 )
array([5, 3, 8, 1, 9, 6, 7, 0, 2, 4])
df. take( [ 2 , 0 , 1 ] , axis= 1 )
df. take( np. random. permutation( 3 ) , axis= 1 )
C B A 0 81 63 50 1 67 90 23 2 47 72 1 3 63 42 50 4 69 87 47 ... ... ... ... 95 64 98 77 96 87 90 91 97 43 81 31 98 85 71 62 99 55 11 68
100 rows × 3 columns
# Random sample: shuffle the column order, then the row order, and keep the
# first 50 shuffled rows. The permutation sizes come from df.shape instead of
# the literals 3 and 100, so the cell keeps working if the frame is resized.
df.take(np.random.permutation(df.shape[1]), axis=1).take(
    np.random.permutation(df.shape[0]), axis=0
)[0:50]
A B C 56 80 55 93 37 46 44 82 78 32 48 42 3 50 42 63 0 50 63 81 86 53 93 20 36 61 74 8 64 52 67 63 95 77 98 64 48 55 63 62 34 36 91 54 11 13 36 28 15 61 19 10 55 99 14 56 33 13 4 89 89 28 75 64 43 38 8 65 10 51 5 3 70 87 6 41 9 18 23 24 28 17 27 46 27 29 3 52 1 23 90 67 82 23 28 74 12 80 77 80 8 22 69 98 65 74 75 90 74 64 75 20 13 6 84 42 66 99 15 96 45 17 93 72 4 47 87 69 7 6 67 16 38 93 39 0 54 4 30 34 94 49 22 26 51 23 48 35 72 86 39 56 73 28 30 79 53 23 95 81 46 37 43 18 92 73 3 3 40 27 48 70 16 57 17 85 29 93 90 95 58 2 37 14 91 63 47 30 57 26 35 98 21 24 10 73 41 92 15 6
分组聚合
df = DataFrame( { 'item' : [ 'Apple' , 'Banana' , 'Orange' , 'Banana' , 'Orange' , 'Apple' ] ,
'price' : [ 4 , 3 , 3 , 2.5 , 4 , 2 ] ,
'color' : [ 'red' , 'yellow' , 'yellow' , 'green' , 'green' , 'green' ] ,
'weight' : [ 12 , 20 , 50 , 30 , 20 , 44 ]
} )
df
item price color weight 0 Apple 4.0 red 12 1 Banana 3.0 yellow 20 2 Orange 3.0 yellow 50 3 Banana 2.5 green 30 4 Orange 4.0 green 20 5 Apple 2.0 green 44
df. groupby( by= 'item' )
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000236FEE0E550>
df. groupby( by= 'item' ) . groups
{'Apple': [0, 5], 'Banana': [1, 3], 'Orange': [2, 4]}
df. groupby( by= 'item' ) [ 'price' ] . mean( )
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
df. groupby( by= 'color' ) [ 'weight' ] . mean( )
color
green 31.333333
red 12.000000
yellow 35.000000
Name: weight, dtype: float64
# Build a {color: mean weight} lookup from the grouped result, then map each
# row's color through it so every row carries its own group's mean weight.
dic = df.groupby('color')['weight'].mean().to_dict()
df['mean_w'] = df['color'].map(dic)
df
高级的数据聚合
1. 使用 groupby 分组后,也可以使用 transform 和 apply 提供自定义的函数实现更多的运算
2. df.groupby('item')['price'].sum() <==> df.groupby('item')['price'].apply(sum)
3. transform 和 apply 都会进行运算,在其中传入函数即可
4. transform 和 apply 也可以传入一个 lambda 表达式
def my_mean(s):
    """Return the arithmetic mean of the values in ``s``.

    Written as a custom aggregation to pass to ``groupby(...).transform`` or
    ``groupby(...).apply``, which call it once per group.

    Args:
        s: A sized iterable of numbers, such as a list or a pandas Series.

    Returns:
        The sum of the values divided by their count, or NaN when ``s`` is
        empty (the same result pandas' own ``mean`` gives for no data).
    """
    # Use len() rather than truthiness: bool() on a pandas Series raises.
    count = len(s)
    if count == 0:
        return float('nan')
    return sum(s) / count
df. groupby( by= 'item' ) [ 'price' ] . transform( my_mean)
0 3.00
1 2.75
2 3.50
3 2.75
4 3.50
5 3.00
Name: price, dtype: float64
df. groupby( by= 'item' ) [ 'price' ] . apply ( my_mean)
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64