ipython常用

Import

import numpy as np
import pandas as pd
from scipy.stats import describe

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb

读取数据

# Load the training set into a DataFrame.
df_train = pd.read_csv(filepath_or_buffer='./input/train.csv')

# First look at the data. In a notebook, evaluate each expression in its
# own cell so that its result is displayed.
df_train.shape       # (rows, columns)
df_train.head()      # first rows of the frame
df_train.info()      # per-column dtype and non-null count
df_train.describe()  # summary statistics

绘图


# Histogram of the raw target values (100 bins).
# x-axis: target value; y-axis: number of rows falling into each bin.
plt.figure(figsize=(12, 5))
plt.hist(df_train['target'].values, bins=100)
plt.title('Histogram target counts')
plt.xlabel('Target')
plt.ylabel('Count')
plt.show()

# Violin plot of the log1p-transformed target, drawn on a white grid.
sns.set_style(style="whitegrid")
sns.violinplot(x=np.log1p(df_train['target'].values))

运算

# Log-transform the target. Two equivalent spellings of log(1 + x):
target_log = np.log(1 + df_train['target'].values)  # explicit form
# np.log1p is the preferred form: it stays accurate for values close to zero,
# so it is assigned last and is the result that is kept.
target_log = np.log1p(df_train['target'].values)

常数列

# Detect constant columns: a column is constant when every row holds the
# same value as the first row.
# NOTE(review): test_df is not loaded in the visible snippets; it has to be
# read in before this cell runs.
constant_train = df_train.loc[:, (df_train == df_train.iloc[0]).all()].columns.tolist()
constant_test = test_df.loc[:, (test_df == test_df.iloc[0]).all()].columns.tolist()
print('Number of constant columns in the train set:', len(constant_train))
print('Number of constant columns in the test set:', len(constant_test))

# Drop constant columns: keep every test-set column that is not constant in
# the training set, preserving the original column order.
constant_train_set = set(constant_train)  # built once for O(1) membership tests
columns_to_use = [col for col in test_df.columns.tolist() if col not in constant_train_set]
len(columns_to_use)

非零项

# Overall fraction of zero-valued entries across all selected columns.
np.mean(df_train[columns_to_use].values == 0)

# Fraction of zero-valued entries per column (mean over the rows, axis=0).
# NOTE: the 'Percentile' column holds a fraction in [0, 1], not a percentile.
train_zeros = pd.DataFrame({
    'Percentile': (df_train[columns_to_use].values == 0).mean(axis=0),
    'Column': columns_to_use,
})

# np.nonzero: histogram (50 bins) of the non-zero entries only.
# log1p maps 0 to 0, so zeros can be filtered out after the transform.
# NOTE(review): this uses every column of df_train, so it assumes they are
# all numeric — confirm.
train_nz = np.log1p(df_train.values.ravel())
train_nz = train_nz[np.nonzero(train_nz)]
plt.hist(train_nz, bins=50)

重复Duplicated

缺失Missing

# True if the training set contains at least one missing value.
# isnull must be called: it is a method, and the bound method has no `.values`.
df_train.isnull().values.any()

异常Outliers