莫烦numpy和pandas数据处理教程 Pandas

一、Numpy的属性

1
2
3
4
5
6
7
8
import numpy as np

array = np.array([[1,2,3],[2,3,4]])
print(array)

print('number of dim:',array.ndim) #维度
print('shape:',array.shape) #几行几列
print('size:',array.size) #几个元素

二、Numpy创建Array

五、Numpy的索引

1
2
3
4
import numpy as np
A = np.arange(3,15)
print(A) #[3 4 5 6 7 8 9 10 11 12 13 14]
print(A[3]) #6

矩阵换成二维的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import numpy as np
A = np.arange(3,15).reshape((3,4))
print(A) #[[3 4 5 6]
[7 8 9 10]
[11 12 13 14]]
print(A[2]) #[11 12 13 14]
print(A[1][1]) #8
print(A[1,1]) #8 与上述效果一样
print(A[:,1]) #第2列的所有数
print(A[1,:]) #第2行的所有数
print(A[1,1:2]) #[8]
print(A[1,1:3]) #[8,9]

for row in A:
print(row) #按行输出

for column in A.T:
print(column) #按列输出

print(A.flatten()) #把矩阵变成一维

for item in A.flat:
print(item) #flat和flatten()功能类似但返回值不同:flatten()返回一个一维数组,flat是一个迭代器,逐个返回矩阵中的元素

三、Numpy的Array合并

合并

1
2
3
4
5
6
7
8
9
10
import numpy as np

A = np.array([1,1,1])
B = np.array([2,2,2])

print(np.vstack((A,B))) #vertical stack 上下合并,注意参数要以元组(A,B)的形式传入
# [[1,1,1]
#  [2,2,2]]
print(np.hstack((A,B))) #horizontal stack 左右合并
# [1,1,1,2,2,2]

增加维度

1
2
3
4
5
6
print(A[:,np.newaxis]) #增加列维度
# [[1]
[1]
[1]]
print(A[np.newaxis,:]) #增加行维度
# [[1,1,1]]

例子:

1
2
3
4
5
6
7
8
9
A = np.array([1,1,1])[:,np.newaxis]
B = np.array([2,2,2])[:,np.newaxis]
C = np.hstack((A,B)) #注意参数要以元组(A,B)的形式传入
print(C)
# [1,2]
[1,2]
[1,2]
print(A.shape,C.shape)
# (3,1) (3,2)

多个Array合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
C = np.concatenate((A,B,B,A),axis=0) 
# vstack和hstack都可以进行多个Array合并,
# concatenate的不同在于可以在后面指定合并的方向。
# 0为上下合并,1为水平合并
print(C)
#
[[1]
[1]
[1]
[2]
[2]
[2]
[2]
[2]
[2]
[1]
[1]
[1]]

四、Numpy的分割

先生成一个array:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import numpy as np

A = np.arange(12).reshape((3,4))
print(A)
#
[[0,1,2,3]
[4,5,6,7]
[8,9,10,11]]
print(np.split(A,2,axis=1))
#axis=1是左右分开(按列分割),axis=0是上下分开(按行分割)
#
[array([[0,1],
[4,5],
[8,9]]),
array([[2,3],
[6,7],
[10,11]])
]

split只能进行相同分割,进行不相等分割要使用array_split

1
2
3
4
5
6
7
8
9
10
11
12
print(np.array_split(A,3,axis=1))
#
[array([[0,1],
[4,5],
[8,9]]),
array([[2],
[6],
[10]])
array([[3],
[7],
[11]])
]

相对于合并,分割也有vsplit和hsplit

1
2
3
4
5
6
7
8
9
10
11
12
print(np.vsplit(A,3))
#
[array([[0,1,2,3]]), array([[4,5,6,7]]), array([[8,9,10,11]])]
print(np.hsplit(A,2))
#
[array([[0,1],
[4,5],
[8,9]]),
array([[2,3],
[6,7],
[10,11]])
]

五、Numpy的copy和deep copy

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import numpy as np

a = np.arange(4)
b = a
c = a
d = b
a[0] = 11 #默认格式是整数
# 此时a,b,c,d都会改变,因为python中这种赋值方式下a,b,c,d是完全一样的
d[1:3] = [22,33]
# 此时a,b.c,d也会全部发生改变
b = a.copy() #deep copy
a[3] = 44
#
a = [11,22,33,44]
b = [11,22,33,3]
deep copy后,a变化不会使b变化

Pandas

一、Pandas的基本介绍

Series

1
2
3
4
5
6
7
8
9
10
11
12
import pandas as pd
import numpy as np
s = pd.Series([1,3,6,np.nan,44,1])
print(s)
#
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64

DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
dates = pd.date_range('20160101',periods=6)
print(dates)
# 先生成一个索引
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')

df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)
# 生成DataFrame
a b c d
2016-01-01 0.545643 2.021614 1.210559 0.467834
2016-01-02 0.666108 0.419161 -1.943871 -0.953263
2016-01-03 -0.759923 -0.337458 -1.865560 0.328037
2016-01-04 0.198904 0.880258 -1.369046 0.885945
2016-01-05 -0.828801 -0.602095 -0.897741 0.168277
2016-01-06 0.937178 -0.593869 0.043363 -0.273371

没有索引生成DataFrame

1
2
3
4
5
6
7
df1 = pd.DataFrame(np.random.randn(3,4))
print(df1)
#
0 1 2 3
0 0.378297 -0.354029 0.407989 -0.067838
1 -1.210963 2.706774 0.875143 0.360681
2 -0.964372 0.919133 0.768482 0.221687

按索引排序

1
2
3
4
5
6
7
8
9
df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
print(df1)

print(df1.sort_index(axis=0,ascending=False))
#
0 1 2 3
2 8 9 10 11
1 4 5 6 7
0 0 1 2 3

按值排序

1
2
3
4
5
6
print(df1.sort_values(by=2))
#
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11

二、选择数据

先生成一个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
import numpy as np

dates = pd.date_range('20130101',periods=6)

df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)

print(df['A'])
print(df.A)
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32

print(df[0:3])
print(df['20130102':'20130104'])
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
A B C D
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15

从标签来选择

1
2
3
4
5
6
7
print(df.loc['20130102'])
#
A 4
B 5
C 6
D 7
Name: 2013-01-02 00:00:00, dtype: int32

从列标签中选择,并输出所有行结果

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
print(df.loc[:,['A','B']])
#
A B
2013-01-01 0 1
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21

print(df.loc['20130102':,['A','B']])
A B
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21

从位置选择

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
print(df.iloc[3]) #第4行的数据
#
A 12
B 13
C 14
D 15
Name: 2013-01-04 00:00:00, dtype: int32

print(df.iloc[3,1]) #第4行第2个数据
#
13

print(df.iloc[3:5,1:3]) #第4行到第6行,第2位到第4位
#
B C
2013-01-04 13 14
2013-01-05 17 18

print(df.iloc[[1,3,5],1:3]) #第2,4,6行...
#
B C
2013-01-02 5 6
2013-01-04 13 14
2013-01-06 21 22

混合选择

1
2
3
4
5
6
print(df.loc[df.index[:3],['A','C']]) # 原文使用df.ix,新版pandas已移除ix,可用loc/iloc代替
#
A C
2013-01-01 0 2
2013-01-02 4 6
2013-01-03 8 10

其它

1
2
3
4
5
6
7
8
9
10
11
12
13
14
print(df)
print(df[df.A>8])
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23

三、Pandas设置值

先生成一个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23

df.iloc[2,2] = 1111
print(df)
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 1111 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23

df.loc['20130101','A'] = 2222
print(df)
#
A B C D
2013-01-01 2222 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 1111 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23

另一种设置值的方法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
df[df.A>4] = 0
print(df)
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 0 0 0 0
2013-01-04 0 0 0 0
2013-01-05 0 0 0 0
2013-01-06 0 0 0 0

df.A[df.A>4] = 0
print(df)
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 0 9 10 11
2013-01-04 0 13 14 15
2013-01-05 0 17 18 19
2013-01-06 0 21 22 23

加一列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
df['F'] = np.nan
print(df)
#
A B C D F
2013-01-01 0 1 2 3 NaN
2013-01-02 4 5 6 7 NaN
2013-01-03 8 9 10 11 NaN
2013-01-04 12 13 14 15 NaN
2013-01-05 16 17 18 19 NaN
2013-01-06 20 21 22 23 NaN

df['E'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101',periods=6))
print(df)
#
A B C D E
2013-01-01 0 1 2 3 1
2013-01-02 4 5 6 7 2
2013-01-03 8 9 10 11 3
2013-01-04 12 13 14 15 4
2013-01-05 16 17 18 19 5
2013-01-06 20 21 22 23 6

四、Pandas处理异常数据

生成一个带有Nan的DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import pandas as pd
import numpy as np

dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan

print(df)
#
A B C D
2013-01-01 0 NaN 2.0 3
2013-01-02 4 5.0 NaN 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23

print(df.dropna(axis=0,how='any'))
#
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23

丢掉带有NaN数据的行或列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
print(df.dropna(axis=0,how='any')) #丢掉行
#
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23

print(df.dropna(axis=1,how='any')) #丢掉列
#
A D
2013-01-01 0 3
2013-01-02 4 7
2013-01-03 8 11
2013-01-04 12 15
2013-01-05 16 19
2013-01-06 20 23

print(df.dropna(axis=0,how='all')) #行或列全是Nan才删除

将没有填的空,填上数据

1
2
3
4
5
6
7
8
9
print(df.fillna(value=0))
#
A B C D
2013-01-01 0 0.0 2.0 3
2013-01-02 4 5.0 0.0 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23

判断整个表格中是否存在缺失数据(NaN)

1
2
3
4
5
6
7
8
9
print(df.isnull())
#
A B C D
2013-01-01 False True False False
2013-01-02 False False True False
2013-01-03 False False False False
2013-01-04 False False False False
2013-01-05 False False False False
2013-01-06 False False False False

数据太多,没办法每行每列找,可以用下列方法

1
2
3
print(np.any(df.isnull()) == True)
#
True

五、Pandas导入导出

导入表格

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import pandas as pd
import numpy as np

data = pd.read_csv('C:/Users/zz/Desktop/Python training/1.csv')
print(data)
#
Student ID Name Gender Language
0 1 A male C
1 2 B female E
2 3 C male A
3 4 D female C
4 5 E male E
5 6 F male A
6 7 G female A
7 8 H female E
8 9 I female C

导出表格

1
data.to_pickle('2.pickle')

六、Pandas合并 concat

生成三个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import pandas as pd
import numpy as np

df1 =pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 =pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 =pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0

第一种合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
res = pd.concat([df1,df2,df3],axis=0)
print(res)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0

res = pd.concat([df1,df2,df3],axis=1)
print(res)
#
a b c d a b c d a b c d
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0

重新赋予左边的索引

1
2
3
4
5
6
7
8
9
10
11
12
13
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(res)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0

另一种合并(‘outer’,’inner’)
先生成两个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])
print(df1)
print(df2)
#
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0

内外连接(默认为outer)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
res = pd.concat([df1,df2],join='outer')
print(res)
#
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0

res = pd.concat([df1,df2],join='inner')
print(res)
#
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0

res = pd.concat([df1,df2],join='inner',ignore_index=True)
print(res)
#
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0

根据索引连接

1
2
3
4
5
6
7
8
9
df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])
res = pd.concat([df1,df2],axis=1).reindex(df1.index) # 原文的join_axes参数在pandas 1.0中已移除,用reindex代替
print(res)
#
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0

append添加

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
df1 =pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 =pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 =pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
res = df1.append(df2,ignore_index=True) # 注意:DataFrame.append在pandas 2.0中已移除,新版本请用pd.concat([df1,df2],ignore_index=True)
print(res)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0

res = df1.append([df2,df3],ignore_index=True) # 同上,新版pandas请用pd.concat([df1,df2,df3],ignore_index=True)
print(res)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0

七、Pandas合并 merge

生成两个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import pandas as pd
import numpy as np

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
#
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3

merge

1
2
3
4
5
6
7
8
res = pd.merge(left,right,on='key')
print(res)
#
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3

两列key

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
res = pd.merge(left,right,on=['key1','key2'])
print(res)
#
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
res = pd.merge(left,right,on=['key1','key2'],how='outer')
# how = ['left','right','outer','inner']
print(res)
#
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3

indicator
生成DataFrame

1
2
3
4
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
print(df2)

使用indicator

1
2
3
4
5
6
7
8
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
#
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only

index

1
2
3
4
5
6
7
8
9
10
11
12
13
14
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
#
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3

handle overlapping

1
2
3
4
5
6
7
8
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
#
k age_boy age_girl
0 K0 1 4
1 K0 1 5

八、Pandas Plot画图

生成Series

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()

ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)

plt.show()