Melbourne housing price prediction
1
2
3
|
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
|
1
2
|
# Location of the Melbourne housing CSV. A raw string keeps the Windows
# backslashes literal: in a normal string "\M" and "\m" are invalid escape
# sequences, which recent Python versions warn about (the resulting path value
# is unchanged).
melbourne_file_path = r'D:\MLlearn\melb_data1.csv'
# Load the full dataset into a DataFrame.
melb_data = pd.read_csv(melbourne_file_path)
|
1
2
|
# Separate the frame into the prediction target (the Price column) and
# everything else, which forms the candidate predictors.
melb_predictors = melb_data.drop(columns=['Price'])
melb_target = melb_data['Price']
|
1
2
|
# To keep the example simple, discard every object-dtype (text) column and
# work with the remaining predictors only.
melb_numeric_predictors = melb_predictors.select_dtypes(exclude='object')
|
1
2
3
4
5
|
# 70/30 train/test split of predictors and target; random_state=0 fixes the
# shuffle so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    melb_numeric_predictors,
    melb_target,
    test_size=0.3,
    train_size=0.7,
    random_state=0,
)
|
1
2
3
4
5
|
def score_dataset(X_train, X_test, y_train, y_test):
    """Train a random forest on the training split and score it on the test split.

    A ``RandomForestRegressor`` with default settings is fitted on
    ``X_train``/``y_train`` and used to predict ``X_test``.  No
    ``random_state`` is passed to the forest, so the score can differ
    slightly from run to run.

    Returns:
        The mean absolute error between ``y_test`` and the predictions.
    """
    forest = RandomForestRegressor().fit(X_train, y_train)
    return mean_absolute_error(y_test, forest.predict(X_test))
|
1
2
3
4
5
6
|
# Approach 1: drop every predictor column that has at least one missing value
# in the training split. The same columns are removed from the test split so
# both frames keep identical columns.
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_test = X_test.drop(columns=cols_with_missing)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))
|
1
2
|
Mean Absolute Error from dropping columns with Missing Values:
186476.80443579494
|
1
2
3
4
|
# Imputation transformer, bound to the name `Imputer` used by the cells below.
# scikit-learn 0.20 introduced sklearn.impute.SimpleImputer and 0.22 removed
# sklearn.preprocessing.Imputer, so prefer the current class and fall back to
# the legacy one only on releases that predate sklearn.impute.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
|
1
2
3
4
5
|
# Approach 2: fill missing values with the imputer's default strategy.
# The imputer is fitted on the training split only and then applied to both
# splits, so no statistics are learned from the test data.
my_imputer = Imputer().fit(X_train)
imputed_X_train = my_imputer.transform(X_train)
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))
|
1
2
|
Mean Absolute Error from Imputation:
182179.6435198822
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
# Approach 3: impute as before, but first record which cells were missing.
# Work on copies so the original splits are left untouched.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# Columns with gaps are chosen from the training split only. This is a
# generator, so it is consumed by the single loop that follows.
cols_with_missing = (name for name in X_train.columns
                     if X_train[name].isna().any())
for col in cols_with_missing:
    # Boolean "<column>_was_missing" flag, added to both splits so their
    # columns stay aligned.
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isna()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isna()

# Imputation: fit on the augmented training frame, then transform both frames.
my_imputer = Imputer().fit(imputed_X_train_plus)
imputed_X_train_plus = my_imputer.transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)
print("Mean Absolute Error from Imputation while Track What Was Imputed:")
print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))
|
1
2
|
Mean Absolute Error from Imputation while Track What Was Imputed:
183829.18093601702
|
|
Suburb |
Address |
Rooms |
Type |
Price |
Method |
SellerG |
Date |
Distance |
Postcode |
... |
Bathroom |
Car |
Landsize |
BuildingArea |
YearBuilt |
CouncilArea |
Lattitude |
Longtitude |
Regionname |
Propertycount |
0 |
Abbotsford |
85 Turner St |
2 |
h |
1480000.0 |
S |
Biggin |
3/12/2016 |
2.5 |
3067.0 |
... |
1.0 |
1.0 |
202.0 |
NaN |
NaN |
Yarra |
-37.7996 |
144.9984 |
Northern Metropolitan |
4019.0 |
1 |
Abbotsford |
25 Bloomburg St |
2 |
h |
1035000.0 |
S |
Biggin |
4/02/2016 |
2.5 |
3067.0 |
... |
1.0 |
0.0 |
156.0 |
79.0 |
1900.0 |
Yarra |
-37.8079 |
144.9934 |
Northern Metropolitan |
4019.0 |
2 |
Abbotsford |
5 Charles St |
3 |
h |
1465000.0 |
SP |
Biggin |
4/03/2017 |
2.5 |
3067.0 |
... |
2.0 |
0.0 |
134.0 |
150.0 |
1900.0 |
Yarra |
-37.8093 |
144.9944 |
Northern Metropolitan |
4019.0 |
3 |
Abbotsford |
40 Federation La |
3 |
h |
850000.0 |
PI |
Biggin |
4/03/2017 |
2.5 |
3067.0 |
... |
2.0 |
1.0 |
94.0 |
NaN |
NaN |
Yarra |
-37.7969 |
144.9969 |
Northern Metropolitan |
4019.0 |
4 |
Abbotsford |
55a Park St |
4 |
h |
1600000.0 |
VB |
Nelson |
4/06/2016 |
2.5 |
3067.0 |
... |
1.0 |
2.0 |
120.0 |
142.0 |
2014.0 |
Yarra |
-37.8072 |
144.9941 |
Northern Metropolitan |
4019.0 |
5 rows × 21 columns
近期评论