莫烦numpy和pandas数据处理教程 Pandas

一、Numpy的属性

1
2
3
4
5
6
7
8
import numpy as np

array = np.array([[1,2,3],[2,3,4]])
print(array)

print('number of dim:',array.ndim) #维度
print('shape:',array.shape) #几行几列
print('size:',array.size) #几个元素

二、Numpy创建Array

五、Numpy的索引

1
2
3
4
import numpy as np
A = np.arange(3,15)
print(A) #[3 4 5 6 7 8 9 10 11 12 13 14]
print(A[3]) #6

矩阵换成二维的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import numpy as np
A = np.arange(3,15).reshape((3,4))
print(A) #[[3 4 5 6]
[7 8 9 10]
[11 12 13 14]]
print(A[2]) #[11 12 13 14]
print(A[1][1]) #8
print(A[1,1]) #8 与上述效果一样
print(A[:,1]) #第2列的所有数
print(A[1,:]) #第2行的所有数
print(A[1,1:2]) #[8]
print(A[1,1:3]) #[8,9]

for row in A:
print(row) #按行输出

for column in A.T:
print(column) #按列输出

print(A.flatten()) #把矩阵变成一维

for item in A.flat:
print(item) #flat和flatten()功能类似但返回值不同:flatten()返回一个一维数组,flat是一个迭代器,逐个返回矩阵中的元素

三、Numpy的Array合并

合并

1
2
3
4
5
6
7
8
9
10
import numpy as np

A = np.array([1,1,1])
B = np.array([2,2,2])

print(np.vstack((A,B))) #vertical stack 上下合并,注意参数要以元组(A,B)的形式传入
# [[1,1,1]
#  [2,2,2]]
print(np.hstack((A,B))) #horizontal stack 左右合并
# [1,1,1,2,2,2]

增加维度

1
2
3
4
5
6
print(A[:,np.newaxis]) #增加列维度
# [[1]
[1]
[1]]
print(A[np.newaxis,:]) #增加行维度
# [[1,1,1]]

例子:

1
2
3
4
5
6
7
8
9
A = np.array([1,1,1])[:,np.newaxis]
B = np.array([2,2,2])[:,np.newaxis]
C = np.hstack((A,B)) #注意参数要以元组(A,B)的形式传入
print(C)
# [1,2]
[1,2]
[1,2]
print(A.shape,C.shape)
# (3,1) (3,2)

多个Array合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
C = np.concatenate((A,B,B,A),axis=0) 
# vstack和hstack都可以进行多个Array合并,
# concatenate的不同在于可以在后面指定合并的方向。
# 0为上下合并,1为水平合并
print(C)
#
[[1]
[1]
[1]
[2]
[2]
[2]
[2]
[2]
[2]
[1]
[1]
[1]]

四、Numpy的分割

先生成一个array:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import numpy as np

A = np.arange(12).reshape((3,4))
print(A)
#
[[0,1,2,3]
[4,5,6,7]
[8,9,10,11]]
print(np.split(A,2,axis=1))
#axis=1是左右分开(按列分割),axis=0是上下分开(按行分割)
#
[array([[0,1],
[4,5],
[8,9]]),
array([[2,3],
[6,7],
[10,11]])
]

split只能进行相同分割,进行不相等分割要使用array_split

1
2
3
4
5
6
7
8
9
10
11
12
print(np.array_split(A,3,axis=1))
#
[array([[0,1],
[4,5],
[8,9]]),
array([[2],
[6],
[10]])
array([[3],
[7],
[11]])
]

相对于合并,分割也有vsplit和hsplit

1
2
3
4
5
6
7
8
9
10
11
12
print(np.vsplit(A,3))
#
[array([[0,1,2,3]]), array([[4,5,6,7]]), array([[8,9,10,11]])]
print(np.hsplit(A,2))
#
[array([[0,1],
[4,5],
[8,9]]),
array([[2,3],
[6,7],
[10,11]])
]

五、Numpy的copy和deep copy

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import numpy as np

a = np.arange(4)
b = a
c = a
d = b
a[0] = 11 #默认格式是整数
# 此时a,b,c,d都会改变,因为python中这种赋值方式下a,b,c,d是完全一样的
d[1:3] = [22,33]
# 此时a,b.c,d也会全部发生改变
b = a.copy() #deep copy
a[3] = 44
#
a = [11,22,33,44]
b = [11,22,33,3]
deep copy后,a变化不会使b变化

Pandas

一、Pandas的基本介绍

Series

1
2
3
4
5
6
7
8
9
10
11
12
import pandas as pd
import numpy as np
s = pd.Series([1,3,6,np.nan,44,1])
print(s)
#
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64

DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
dates = pd.date_range('20160101',periods=6)
print(dates)
# 先生成一个索引
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')

df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)
# 生成DataFrame
a b c d
2016-01-01 0.545643 2.021614 1.210559 0.467834
2016-01-02 0.666108 0.419161 -1.943871 -0.953263
2016-01-03 -0.759923 -0.337458 -1.865560 0.328037
2016-01-04 0.198904 0.880258 -1.369046 0.885945
2016-01-05 -0.828801 -0.602095 -0.897741 0.168277
2016-01-06 0.937178 -0.593869 0.043363 -0.273371

没有索引生成DataFrame

1
2
3
4
5
6
7
df1 = pd.DataFrame(np.random.randn(3,4))
print(df1)
#
0 1 2 3
0 0.378297 -0.354029 0.407989 -0.067838
1 -1.210963 2.706774 0.875143 0.360681
2 -0.964372 0.919133 0.768482 0.221687

按索引排序

1
2
3
4
5
6
7
8
9
df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
print(df1)

print(df1.sort_index(axis=0,ascending=False))
#
0 1 2 3
2 8 9 10 11
1 4 5 6 7
0 0 1 2 3

按值排序

1
2
3
4
5
6
print(df1.sort_values(by=2))
#
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11

二、选择数据

先生成一个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
import numpy as np

dates = pd.date_range('20130101',periods=6)

df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)

print(df['A'])
print(df.A)
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32

print(df[0:3])
print(df['20130102':'20130104'])
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
A B C D
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15

从标签来选择

1
2
3
4
5
6
7
print(df.loc['20130102'])
#
A 4
B 5
C 6
D 7
Name: 2013-01-02 00:00:00, dtype: int32

从列标签中选择,并输出所有行结果

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
print(df.loc[:,['A','B']])
#
A B
2013-01-01 0 1
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21

print(df.loc['20130102':,['A','B']])
A B
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21

从位置选择

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
print(df.iloc[3]) #第4行的数据
#
A 12
B 13
C 14
D 15
Name: 2013-01-04 00:00:00, dtype: int32

print(df.iloc[3,1]) #第4行第2个数据
#
13

print(df.iloc[3:5,1:3]) #第4行到第6行,第2位到第4位
#
B C
2013-01-04 13 14
2013-01-05 17 18

print(df.iloc[[1,3,5],1:3]) #第2,4,6行...
#
B C
2013-01-02 5 6
2013-01-04 13 14
2013-01-06 21 22

混合选择

1
2
3
4
5
6
print(df.loc[df.index[:3],['A','C']]) # 原文使用df.ix,新版pandas已移除ix,可用loc/iloc代替
#
A C
2013-01-01 0 2
2013-01-02 4 6
2013-01-03 8 10

其它

1
2
3
4
5
6
7
8
9
10
11
12
13
14
print(df)
print(df[df.A>8])
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23

三、Pandas设置值

先生成一个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23

df.iloc[2,2] = 1111
print(df)
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 1111 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23

df.loc['20130101','A'] = 2222
print(df)
#
A B C D
2013-01-01 2222 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 1111 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23

另一种设置值的方法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
df[df.A>4] = 0
print(df)
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 0 0 0 0
2013-01-04 0 0 0 0
2013-01-05 0 0 0 0
2013-01-06 0 0 0 0

df.A[df.A>4] = 0
print(df)
#
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 0 9 10 11
2013-01-04 0 13 14 15
2013-01-05 0 17 18 19
2013-01-06 0 21 22 23

加一列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
df['F'] = np.nan
print(df)
#
A B C D F
2013-01-01 0 1 2 3 NaN
2013-01-02 4 5 6 7 NaN
2013-01-03 8 9 10 11 NaN
2013-01-04 12 13 14 15 NaN
2013-01-05 16 17 18 19 NaN
2013-01-06 20 21 22 23 NaN

df['E'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101',periods=6))
print(df)
#
A B C D E
2013-01-01 0 1 2 3 1
2013-01-02 4 5 6 7 2
2013-01-03 8 9 10 11 3
2013-01-04 12 13 14 15 4
2013-01-05 16 17 18 19 5
2013-01-06 20 21 22 23 6

四、Pandas处理异常数据

生成一个带有Nan的DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import pandas as pd
import numpy as np

dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan

print(df)
#
A B C D
2013-01-01 0 NaN 2.0 3
2013-01-02 4 5.0 NaN 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23

print(df.dropna(axis=0,how='any'))
#
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23

丢掉带有NaN数据的行或列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
print(df.dropna(axis=0,how='any')) #丢掉行
#
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23

print(df.dropna(axis=1,how='any')) #丢掉列
#
A D
2013-01-01 0 3
2013-01-02 4 7
2013-01-03 8 11
2013-01-04 12 15
2013-01-05 16 19
2013-01-06 20 23

print(df.dropna(axis=0,how='all')) #行或列全是Nan才删除

将没有填的空,填上数据

1
2
3
4
5
6
7
8
9
print(df.fillna(value=0))
#
A B C D
2013-01-01 0 0.0 2.0 3
2013-01-02 4 5.0 0.0 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23

判断整个表格中是否存在缺失数据(NaN)

1
2
3
4
5
6
7
8
9
print(df.isnull())
#
A B C D
2013-01-01 False True False False
2013-01-02 False False True False
2013-01-03 False False False False
2013-01-04 False False False False
2013-01-05 False False False False
2013-01-06 False False False False

数据太多,没办法每行每列找,可以用下列方法

1
2
3
print(np.any(df.isnull()) == True)
#
True

五、Pandas导入导出

导入表格

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import pandas as pd
import numpy as np

data = pd.read_csv('C:/Users/zz/Desktop/Python training/1.csv')
print(data)
#
Student ID Name Gender Language
0 1 A male C
1 2 B female E
2 3 C male A
3 4 D female C
4 5 E male E
5 6 F male A
6 7 G female A
7 8 H female E
8 9 I female C

导出表格

1
data.to_pickle('2.pickle')

六、Pandas合并 concat

生成三个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import pandas as pd
import numpy as np

df1 =pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 =pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 =pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0

第一种合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
res = pd.concat([df1,df2,df3],axis=0)
print(res)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0

res = pd.concat([df1,df2,df3],axis=1)
print(res)
#
a b c d a b c d a b c d
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0

重新赋予左边的索引

1
2
3
4
5
6
7
8
9
10
11
12
13
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(res)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0

另一种合并(‘outer’,’inner’)
先生成两个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])
print(df1)
print(df2)
#
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0

内外连接(默认为outer)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
res = pd.concat([df1,df2],join='outer')
print(res)
#
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0

res = pd.concat([df1,df2],join='inner')
print(res)
#
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0

res = pd.concat([df1,df2],join='inner',ignore_index=True)
print(res)
#
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0

根据索引连接

1
2
3
4
5
6
7
8
9
df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])
res = pd.concat([df1,df2],axis=1).reindex(df1.index) # 原文的join_axes参数在pandas 1.0中已移除,用reindex代替
print(res)
#
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0

append添加

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
df1 =pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 =pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 =pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
res = df1.append(df2,ignore_index=True) # 注意:DataFrame.append在pandas 2.0中已移除,新版本请用pd.concat([df1,df2],ignore_index=True)
print(res)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0

res = df1.append([df2,df3],ignore_index=True) # 同上,新版pandas请用pd.concat([df1,df2,df3],ignore_index=True)
print(res)
#
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0

七、Pandas合并 merge

生成两个DataFrame

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import pandas as pd
import numpy as np

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
#
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3

merge

1
2
3
4
5
6
7
8
res = pd.merge(left,right,on='key')
print(res)
#
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3

两列key

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
res = pd.merge(left,right,on=['key1','key2'])
print(res)
#
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
res = pd.merge(left,right,on=['key1','key2'],how='outer')
# how = ['left','right','outer','inner']
print(res)
#
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3

indicator
生成DataFrame

1
2
3
4
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
print(df2)

使用indicator

1
2
3
4
5
6
7
8
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
#
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only

index

1
2
3
4
5
6
7
8
9
10
11
12
13
14
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
#
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3

handle overlapping

1
2
3
4
5
6
7
8
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
#
k age_boy age_girl
0 K0 1 4
1 K0 1 5

八、Pandas Plot画图

生成Series

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()

ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)

plt.show()