## Case 01 - Real Estate

Consider you want to analyse the real estate market. It was not possible to obtain data related to each appartment. So it was used the information from national The information you have available is the average information in each municipality. 

'https://github.com/masterfloss/data/blob/main/realEstate1.xlsx?raw=true'

Create a model that explains the price. 

In [1]:
# Import libraries

import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as metrics

In [2]:
# Preprocessing

df=pd.read_excel('https://github.com/masterfloss/data/blob/main/realEstate1.xlsx?raw=true')
df1=df
df1["waste2018"]=pd.to_numeric(df1.waste2018, errors='coerce')
df1["tourism"]=pd.to_numeric(df1.tourism2018, errors='coerce')
df1["wasteSel2018"]=pd.to_numeric(df1.wasteSel2018, errors='coerce')
df1["wage"]=pd.to_numeric(df1.wage2018, errors='coerce')
df1['waste']=df1["wasteSel2018"]/df1["waste2018"]
df1['tax']=df1['IMT2018percapita']+df1['IMI2018percapita'] 
df1=df.dropna()
df1.dtypes

Unnamed: 0              object
price2018              float64
price2000                int64
purchacingPower2017    float64
crime2019              float64
crime1993              float64
wage2018               float64
waste2018              float64
wasteSel2018           float64
IMT2018percapita       float64
IMI2018percapita       float64
tourism2018             object
wage2018.1              object
grad                   float64
tourism                float64
wage                   float64
waste                  float64
tax                    float64
dtype: object

In [3]:
# Creating a regression model
y=df1['price2018']
X= df1[['purchacingPower2017','crime2019','waste','tourism','tax']]

X = sm.add_constant(X)
results = sm.OLS(y, X).fit()
results.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,price2018,R-squared:,0.771
Model:,OLS,Adj. R-squared:,0.767
Method:,Least Squares,F-statistic:,215.8
Date:,"Sun, 21 Nov 2021",Prob (F-statistic):,2.2600000000000002e-100
Time:,16:13:58,Log-Likelihood:,-3774.3
No. Observations:,327,AIC:,7561.0
Df Residuals:,321,BIC:,7583.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.593e+04,7109.203,-10.681,0.000,-8.99e+04,-6.19e+04
purchacingPower2017,1241.9925,93.044,13.348,0.000,1058.940,1425.045
crime2019,-610.6191,195.320,-3.126,0.002,-994.889,-226.350
waste,7.775e+04,1.97e+04,3.947,0.000,3.9e+04,1.17e+05
tourism,-3.7013,1.177,-3.145,0.002,-6.017,-1.386
tax,227.1894,14.862,15.287,0.000,197.951,256.428

0,1,2,3
Omnibus:,60.56,Durbin-Watson:,1.24
Prob(Omnibus):,0.0,Jarque-Bera (JB):,142.319
Skew:,0.908,Prob(JB):,1.25e-31
Kurtosis:,5.674,Cond. No.,24400.0


In [4]:
# VIF (Variance inflation factor) is a measure of the amount of multicollinearity 
# in a set of multiple regression variables.

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
  
print(vif_data)

               feature        VIF
0                const  26.125925
1  purchacingPower2017   1.475335
2            crime2019   1.887041
3                waste   1.332964
4              tourism   1.792031
5                  tax   2.547069


In [5]:
# Compare several algorithms in order to identify the one that predicts better.
# Split train and test

# Linear Regression
# Ridge Regression
# Lasso Regression
# Bayesian Regression
# Polynomial Regression
# Neural Network
# Random Forest
# Gradient Boosting 


In [6]:
# Split train and test

y=df1['price2018']-df1['price2000']
X= df1[['purchacingPower2017','crime2019','IMT2018percapita','IMI2018percapita','waste']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

y_test_predict={}

In [7]:
# Linear Regression

reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(reg.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(reg.score(X_test, y_test)))
LR_ATrain=reg.score(X_train, y_train)
LR_ATest=reg.score(X_test, y_test)
y_test_predict['linear']=reg.predict(X_test)

Accuracy on the training subset: 0.763
Accuracy on the test subset: 0.642


In [8]:
# Ridge Regression

reg = linear_model.Ridge (alpha = .5)
reg.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(reg.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(reg.score(X_test, y_test)))

y_test_predict['Ridge']=reg.predict(X_test)

Accuracy on the training subset: 0.762
Accuracy on the test subset: 0.644


In [9]:
#Lasso Regression

reg = linear_model.Lasso(alpha = .5)
reg.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(reg.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(reg.score(X_test, y_test)))

y_test_predict['Lasso']=reg.predict(X_test)

Accuracy on the training subset: 0.763
Accuracy on the test subset: 0.642


In [10]:
#Bayesian Regression

reg = linear_model.BayesianRidge(compute_score=True)
reg.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(reg.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(reg.score(X_test, y_test)))

y_test_predict['Bayesian']=reg.predict(X_test)

Accuracy on the training subset: 0.757
Accuracy on the test subset: 0.642


In [11]:
#Polynomial Regression

reg = Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', LinearRegression(fit_intercept=False))])
reg.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(reg.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(reg.score(X_test, y_test)))

y_test_predict['Poli']=reg.predict(X_test)

Accuracy on the training subset: 0.818
Accuracy on the test subset: 0.426


In [12]:
#Neural Network

reg = MLPRegressor(random_state=1,hidden_layer_sizes = (9,7), activation='relu', max_iter=5000, solver='lbfgs')
reg.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(reg.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(reg.score(X_test, y_test)))

y_test_predict['NN']=reg.predict(X_test)

Accuracy on the training subset: 0.755
Accuracy on the test subset: 0.619


In [13]:
#Random Forest

from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor(n_estimators=69, max_depth=3, random_state=0)
reg.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(reg.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(reg.score(X_test, y_test)))


y_test_predict['Rforest']=reg.predict(X_test)

Accuracy on the training subset: 0.838
Accuracy on the test subset: 0.644


In [14]:
# Gradient Boosting 

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(reg.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(reg.score(X_test, y_test)))


y_test_predict['GradientBoosting']=reg.predict(X_test)

Accuracy on the training subset: 0.838
Accuracy on the test subset: 0.644


In [15]:
# Use metrics to compare

DicMeasures={}
DicMeasures={'Measures':['MSE','MedSE','MAE','EV','R2']}
for a in y_test_predict.keys():
    # Mean absolute error 
    DicMeasures[a]= []
    DicMeasures[a].append(round(metrics.mean_absolute_error(y_test, y_test_predict[a]),2))
    # Mean squared error 
    DicMeasures[a].append(round(metrics.mean_squared_error(y_test, y_test_predict[a]),2))
    # Median absolute error 
    DicMeasures[a].append(round(metrics.median_absolute_error(y_test, y_test_predict[a]),2))
    # Explain variance score 
    DicMeasures[a].append(round(metrics.explained_variance_score(y_test, y_test_predict[a]),2)) 
    # R2 score 
    DicMeasures[a].append(round(metrics.r2_score(y_test, y_test_predict[a]),2))
    
df=pd.DataFrame(data=DicMeasures)
pd.options.display.float_format = '{:.2f}'.format
df.set_index('Measures').T

Measures,MSE,MedSE,MAE,EV,R2
linear,10320.86,212656647.85,7763.38,0.64,0.64
Ridge,10174.21,211434634.52,7963.67,0.65,0.64
Lasso,10319.36,212635240.68,7755.35,0.64,0.64
Bayesian,9917.78,212438960.28,7803.11,0.64,0.64
Poli,11331.0,340287984.58,7179.03,0.44,0.43
NN,10322.48,225913545.82,8426.01,0.63,0.62
Rforest,10597.78,211291876.16,8017.24,0.66,0.64
GradientBoosting,10597.78,211291876.16,8017.24,0.66,0.64
