{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clusters\n",
    "\n",
    "Author: Carlos J. Costa, ISEG\n",
    "\n",
    "Purpose: Identify clusters in a randomly generated sample of blobs\n",
    "\n",
    "**1** Import the libraries needed: numpy, sklearn, matplotlib and pandas\n",
    "\n",
    "**2** Generate a sample of blobs and convert it into a dataframe called df1\n",
    "\n",
    "**3** Verify the datatypes\n",
    "\n",
    "**4** Plot the blobs\n",
    "\n",
    "**5** Calculate the WCSS (within-cluster sum of squares) for 1 to 10 clusters\n",
    "\n",
    "**6** Fit the final model and plot the new chart with centroids\n",
    "\n",
    "**7** Identify which group each item belongs to\n",
    "\n",
    "**8** Add the group as a new column\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.cluster import KMeans\n",
    "# make_blobs is imported from sklearn.datasets; the old\n",
    "# sklearn.datasets.samples_generator path no longer exists in current scikit-learn.\n",
    "from sklearn.datasets import make_blobs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Generate the sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 400 points around 5 centres; the fixed random_state makes the sample reproducible.\n",
    "XY, y = make_blobs(n_samples=400, centers=5, cluster_std=0.60, random_state=0)\n",
    "df1 = pd.DataFrame(XY)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Verify the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Plot the blobs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(df1[0], df1[1])\n",
    "plt.title('Blobs')\n",
    "plt.xlabel('X')\n",
    "plt.ylabel('Y')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Elbow method (WCSS)\n",
    "\n",
    "Fit k-means for 1 to 10 clusters and record the inertia of each fit, to see where adding clusters stops paying off."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wcss = []\n",
    "for i in range(1, 11):\n",
    "    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)\n",
    "    kmeans.fit(df1)\n",
    "    wcss.append(kmeans.inertia_)\n",
    "plt.plot(range(1, 11), wcss)\n",
    "plt.title('Elbow Method')\n",
    "plt.xlabel('Number of clusters')\n",
    "plt.ylabel('WCSS')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Fit the final model and plot the centroids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit once on df1; pred_y holds one cluster label per row of df1.\n",
    "kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=400, n_init=10, random_state=0)\n",
    "pred_y = kmeans.fit_predict(df1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(df1[0], df1[1])\n",
    "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Group of each item"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Add the group as a new column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = pd.DataFrame(pred_y, columns=['groups'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df1 and df2 both carry the default integer index, so an index-on-index merge lines the rows up.\n",
    "XY2 = df1.merge(df2, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "XY2.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}