{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# lab 11\n", "\n", "***Regression Analysis***\n", "\n", "Suppose you have a dataset of cars (CO2_passenger_cars2018b.csv). You want to know which features contribute to the increase of CO2 emissions.\n", "\n", "1 Import the needed libraries\n", "\n", "2 read the data from the file\n", "\n", "3 list the dataset showing the first 5 rows. View the data types\n", "\n", "4 verify the possible values of the categorical variable Ft\n", "\n", "5 convert Petrol to PETROL and Diesel to DIESEL\n", "\n", "6 verify again the values of the categorical variable Ft\n", "\n", "7 convert variables 'm (kg)', 'ec (cm3)', 'ep (KW)' and 'Enedc (g/km)' to numeric with `pd.to_numeric`; its `errors` argument works as follows:\n", "\n", " If ‘raise’, then invalid parsing will raise an exception\n", " If ‘coerce’, then invalid parsing will be set as NaN\n", " If ‘ignore’, then invalid parsing will return the input\n", "\n", "8 remove all rows containing NaN from the df dataset and assign the result to XY\n", "\n", "9 create a Y vector and X matrix\n", "\n", "10 create a regression model\n", "\n", "11 analyse the correlation between variables. Use seaborn. 
For example:\n", "\n", " import seaborn as sns\n", " import matplotlib.pyplot as plt\n", " fig = plt.figure(figsize=[12, 12])\n", " corr_mtx = XY.corr()\n", " sns.heatmap(corr_mtx, xticklabels=corr_mtx.columns, yticklabels=corr_mtx.columns, annot=True, cmap='Blues')\n", " plt.title('Correlation analysis')\n", " plt.show()\n", "\n", "12 convert Ft into dummy variables\n", "\n", "13 add the dummy variable to a new dataset XY2\n", "\n", "14 create a Y vector and X matrix\n", "\n", "15 create a regression model\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.linear_model import LinearRegression\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3049: DtypeWarning: Columns (2,5) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "df=pd.read_csv(\"CO2_passenger_cars2018b.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FabricanteCommercialNamem (kg)Enedc (g/km)Ftec (cm3)ep (KW)
0VOLKSWAGENGOLF1375104.0DIESEL159885.0
1VOLKSWAGENGOLF1340119.0PETROL1498110.0
2VOLKSWAGENGOLF1320120.0PETROL1498110.0
3VOLKSWAGENPOLO1145104.0PETROL99970.0
4VOLKSWAGENPOLO1145104.0PETROL99970.0
\n", "
" ], "text/plain": [ " Fabricante CommercialName m (kg) Enedc (g/km) Ft ec (cm3) ep (KW)\n", "0 VOLKSWAGEN GOLF 1375 104.0 DIESEL 1598 85.0\n", "1 VOLKSWAGEN GOLF 1340 119.0 PETROL 1498 110.0\n", "2 VOLKSWAGEN GOLF 1320 120.0 PETROL 1498 110.0\n", "3 VOLKSWAGEN POLO 1145 104.0 PETROL 999 70.0\n", "4 VOLKSWAGEN POLO 1145 104.0 PETROL 999 70.0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Fabricante object\n", "CommercialName object\n", "m (kg) object\n", "Enedc (g/km) float64\n", "Ft object\n", "ec (cm3) object\n", "ep (KW) float64\n", "dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['DIESEL', 'PETROL', nan, 'Petrol', 'Diesel'], dtype=object)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Ft'].unique()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['DIESEL', 'PETROL', nan], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"Ft\"]=df[\"Ft\"].replace(\"Petrol\",\"PETROL\")\n", "df[\"Ft\"]=df[\"Ft\"].replace(\"Diesel\",\"DIESEL\")\n", "df['Ft'].unique()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Fabricante object\n", "CommercialName object\n", "m (kg) float64\n", "Enedc (g/km) float64\n", "Ft object\n", "ec (cm3) float64\n", "ep (KW) float64\n", "dtype: object" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:,2]=pd.to_numeric(df.iloc[:,2],errors='coerce')\n", "df.iloc[:,5]=pd.to_numeric(df.iloc[:,5],errors='coerce')\n", 
"df.dtypes" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FabricanteCommercialNamem (kg)Enedc (g/km)Ftec (cm3)ep (KW)
0VOLKSWAGENGOLF1375.0104.0DIESEL1598.085.0
1VOLKSWAGENGOLF1340.0119.0PETROL1498.0110.0
2VOLKSWAGENGOLF1320.0120.0PETROL1498.0110.0
3VOLKSWAGENPOLO1145.0104.0PETROL999.070.0
4VOLKSWAGENPOLO1145.0104.0PETROL999.070.0
\n", "
" ], "text/plain": [ " Fabricante CommercialName m (kg) Enedc (g/km) Ft ec (cm3) ep (KW)\n", "0 VOLKSWAGEN GOLF 1375.0 104.0 DIESEL 1598.0 85.0\n", "1 VOLKSWAGEN GOLF 1340.0 119.0 PETROL 1498.0 110.0\n", "2 VOLKSWAGEN GOLF 1320.0 120.0 PETROL 1498.0 110.0\n", "3 VOLKSWAGEN POLO 1145.0 104.0 PETROL 999.0 70.0\n", "4 VOLKSWAGEN POLO 1145.0 104.0 PETROL 999.0 70.0" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "XY=df.dropna()\n", "XY.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "Y=XY['Enedc (g/km)']\n", "X=XY[['m (kg)','ec (cm3)','ep (KW)']]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n", " normalize=False)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lm=LinearRegression()\n", "lm.fit(X,Y)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.02414271, -0.00170046, 0.31175694])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lm.coef_" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.494459098000691" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lm.score(X,Y)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "fig = plt.figure(figsize=[12, 12])\n", "corr_mtx = XY.corr()\n", "sns.heatmap(corr_mtx, xticklabels=corr_mtx.columns, yticklabels=corr_mtx.columns, annot=True, cmap='Blues')\n", "plt.title('Correlation analysis')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FabricanteCommercialNamem (kg)Enedc (g/km)Ftec (cm3)ep (KW)
0VOLKSWAGENGOLF1375.0104.01.01598.085.0
1VOLKSWAGENGOLF1340.0119.00.01498.0110.0
2VOLKSWAGENGOLF1320.0120.00.01498.0110.0
3VOLKSWAGENPOLO1145.0104.00.0999.070.0
4VOLKSWAGENPOLO1145.0104.00.0999.070.0
\n", "
" ], "text/plain": [ " Fabricante CommercialName m (kg) Enedc (g/km) Ft ec (cm3) ep (KW)\n", "0 VOLKSWAGEN GOLF 1375.0 104.0 1.0 1598.0 85.0\n", "1 VOLKSWAGEN GOLF 1340.0 119.0 0.0 1498.0 110.0\n", "2 VOLKSWAGEN GOLF 1320.0 120.0 0.0 1498.0 110.0\n", "3 VOLKSWAGEN POLO 1145.0 104.0 0.0 999.0 70.0\n", "4 VOLKSWAGEN POLO 1145.0 104.0 0.0 999.0 70.0" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2=df\n", "df2[\"Ft\"]=df2[\"Ft\"].replace(\"PETROL\",0)\n", "df2[\"Ft\"]=df2[\"Ft\"].replace(\"DIESEL\",1)\n", "XY2=df2.dropna()\n", "XY2.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "Y2=XY2['Enedc (g/km)']\n", "X2=XY2[['m (kg)','ec (cm3)','ep (KW)','Ft']]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n", " normalize=False)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lm2=LinearRegression()\n", "lm2.fit(X2,Y2)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 5.40879228e-02, 1.72812603e-02, 4.57921958e-02, -2.66822078e+01])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lm2.coef_" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6740223371787819" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lm2.score(X2,Y2)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "29.705456863284923" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lm2.intercept_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { 
"display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }