{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Regression using sklearn\n", "\n", "Author: Carlos J. Costa, ISEG\n", "\n", "Purpose: Identify the weight of several features in the home prices in Boston.\n", "\n", "**1** import libraries needed: sklearn and pandas\n", "\n", "**2** use boston dataset from https://scikit-learn.org/stable/datasets/index.html and convert into two dataframes: X for the features and Y for the target.\n", "\n", "**3** Verify 5 lines of the features variables\n", "\n", "**4** Veify datatype\n", "\n", "**5** Create new features variables with only 2 variables\n", "\n", "**6** Create and fit the model: model = LinearRegression().fit(features, target)\n", "\n", "**7** obtain intercept\n", "\n", "**8** obtain coeficients \n", "\n", "**9** obtain R^2\n", "\n", "Comment the following code:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#\n", "from sklearn.linear_model import LinearRegression\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#\n", "from sklearn.datasets import load_boston\n", "boston =load_boston()\n", "X = pd.DataFrame(boston.data, columns=boston.feature_names)\n", "Y = pd.DataFrame(boston.target)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
\n", "
" ], "text/plain": [ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", "\n", " PTRATIO B LSTAT \n", "0 15.3 396.90 4.98 \n", "1 17.8 396.90 9.14 \n", "2 17.8 392.83 4.03 \n", "3 18.7 394.63 2.94 \n", "4 18.7 396.90 5.33 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "CRIM float64\n", "ZN float64\n", "INDUS float64\n", "CHAS float64\n", "NOX float64\n", "RM float64\n", "AGE float64\n", "DIS float64\n", "RAD float64\n", "TAX float64\n", "PTRATIO float64\n", "B float64\n", "LSTAT float64\n", "dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "X.dtypes" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#\n", "X1 = X.iloc[:,0:2]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#\n", "regression = LinearRegression()\n", "model = regression.fit(X1, Y)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([22.48562811])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "model.intercept_" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[-0.35207832, 0.11610909]])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "model.coef_" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.23398843834155303" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.score(X1,Y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }