{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clusters\n",
    "\n",
    "Author: Carlos J. Costa, ISEG\n",
    "\n",
    "Purpose: Identify clusters in a random generated blobs sample\n",
    "\n",
    "**1** import libraries needed:numpy, sklearn, matplotlib and pandas\n",
    "\n",
    "**2** generate a sample of blobs and convert it into a dataframe called df1\n",
    "\n",
    "**3** Verify datatype\n",
    "\n",
    "**4** Plot the blobs\n",
    "\n",
    "**5** calculete WCSS\n",
    "\n",
    "**6** plot the new chart with centroids\n",
    "\n",
    "**7** identify to what group does each item belongs\n",
    "\n",
    "**8** add new column\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.cluster import KMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets.samples_generator import make_blobs\n",
    "XY,y= make_blobs(n_samples=400, centers=5, cluster_std=0.60, random_state=0)\n",
    "df1=pd.DataFrame(XY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(df1[0], df1[1])\n",
    "plt.title('Blobs')\n",
    "plt.xlabel('X')\n",
    "plt.ylabel('Y')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wcss = []\n",
    "for i in range(1, 11):\n",
    "    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)\n",
    "    kmeans.fit(df1)\n",
    "    wcss.append(kmeans.inertia_)\n",
    "plt.plot(range(1, 11), wcss)\n",
    "plt.title('Elbow Method')\n",
    "plt.xlabel('Number of clusters')\n",
    "plt.ylabel('WCSS')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=400, n_init=10, random_state=0)\n",
    "pred_y = kmeans.fit_predict(df1)\n",
    "plt.scatter(df1[0], df1[1])\n",
    "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=400, n_init=10, random_state=0)\n",
    "pred_y = kmeans.fit_predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2=pd.DataFrame(pred_y, columns=['groups'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "XY2=df1.merge(df2,left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "XY2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}