{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f3571d1d",
   "metadata": {},
   "source": [
    "### Lab7\n",
    "\n",
    "Consider the following files that have information about movies:\n",
    "https://github.com/masterfloss/datamovies/raw/main/movies_ratings.tsv (this dataset includes information related to ratings of IMDb)\n",
    "https://github.com/masterfloss/datamovies/raw/main/moviesPT3.xlsx (this dataset includes information related to the movies exhibited in Portugal)\n",
    "1.\tExplain the manipulation of the dataset\n",
    "2.\tCreate new columns (e.g., gross revenue by a spectator, gross revenue by session, spectators by session)\n",
    "3.\tCreate a regression model\n",
    "4.\tCreate a classification model\n",
    "5.\tUse unsupervised learning (cluster analysis and Dimensionally reduction)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "796ce725",
   "metadata": {},
   "outputs": [],
   "source": [
    "Read "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9c80bf16",
   "metadata": {},
   "outputs": [],
   "source": [
    "#For regression, use the follwoing libraies\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, explained_variance_score,r2_score\n",
    "from sklearn.neural_network import MLPRegressor\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.ensemble import RandomForestRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9d575728",
   "metadata": {},
   "outputs": [],
   "source": [
    "#For classification, use the follwoing libtaries\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9d04ea4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "eaa827d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1=pd.read_csv('https://github.com/masterfloss/datamovies/raw/main/movies_ratings.tsv',sep='\\t')\n",
    "df2=pd.read_excel('https://github.com/masterfloss/datamovies/raw/main/moviesPT3.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b339513e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explain\n",
    "for i in  df2.columns:\n",
    "    df2[i].fillna(method ='ffill', inplace = True) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "084c6c74",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explain\n",
    "for i in df2.index:\n",
    "    if(df2['Title.1'][i]!=df2['Title'][i]):\n",
    "        df2.drop(i,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "93f8ef50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explain\n",
    "df2=df2.rename(columns={\"ID Imdb\": \"tconst\"})\n",
    "df2=df2.merge(df1, on='tconst', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "82cd183d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explain\n",
    "Tdirctors=df2.groupby(by=[\"Director\"]).mean()\n",
    "Tdirctors=Tdirctors.reset_index()[['Director','averageRating']]\n",
    "df2=df2.merge(Tdirctors, on='Director', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "1153a2e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explain\n",
    "df2=df2.drop(columns=['CodeIGAC','ID Imdb1','distributor'])\n",
    "df2=df2.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "28fee967",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>exhibition  year</th>\n",
       "      <th>number of sessions</th>\n",
       "      <th>number of spectators</th>\n",
       "      <th>gross revenue</th>\n",
       "      <th>averageRating_x</th>\n",
       "      <th>numVotes</th>\n",
       "      <th>averageRating_y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>exhibition  year</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.115227</td>\n",
       "      <td>-0.117738</td>\n",
       "      <td>-0.089828</td>\n",
       "      <td>0.030095</td>\n",
       "      <td>-0.118093</td>\n",
       "      <td>0.068265</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>number of sessions</th>\n",
       "      <td>-0.115227</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.930709</td>\n",
       "      <td>0.926633</td>\n",
       "      <td>-0.071972</td>\n",
       "      <td>0.383125</td>\n",
       "      <td>-0.063365</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>number of spectators</th>\n",
       "      <td>-0.117738</td>\n",
       "      <td>0.930709</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.991244</td>\n",
       "      <td>-0.024893</td>\n",
       "      <td>0.374433</td>\n",
       "      <td>-0.021391</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gross revenue</th>\n",
       "      <td>-0.089828</td>\n",
       "      <td>0.926633</td>\n",
       "      <td>0.991244</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.020310</td>\n",
       "      <td>0.376442</td>\n",
       "      <td>-0.015741</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>averageRating_x</th>\n",
       "      <td>0.030095</td>\n",
       "      <td>-0.071972</td>\n",
       "      <td>-0.024893</td>\n",
       "      <td>-0.020310</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.268273</td>\n",
       "      <td>0.873109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>numVotes</th>\n",
       "      <td>-0.118093</td>\n",
       "      <td>0.383125</td>\n",
       "      <td>0.374433</td>\n",
       "      <td>0.376442</td>\n",
       "      <td>0.268273</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.204394</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>averageRating_y</th>\n",
       "      <td>0.068265</td>\n",
       "      <td>-0.063365</td>\n",
       "      <td>-0.021391</td>\n",
       "      <td>-0.015741</td>\n",
       "      <td>0.873109</td>\n",
       "      <td>0.204394</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      exhibition  year  number of sessions  \\\n",
       "exhibition  year              1.000000           -0.115227   \n",
       "number of sessions           -0.115227            1.000000   \n",
       "number of spectators         -0.117738            0.930709   \n",
       "gross revenue                -0.089828            0.926633   \n",
       "averageRating_x               0.030095           -0.071972   \n",
       "numVotes                     -0.118093            0.383125   \n",
       "averageRating_y               0.068265           -0.063365   \n",
       "\n",
       "                      number of spectators  gross revenue  averageRating_x  \\\n",
       "exhibition  year                 -0.117738      -0.089828         0.030095   \n",
       "number of sessions                0.930709       0.926633        -0.071972   \n",
       "number of spectators              1.000000       0.991244        -0.024893   \n",
       "gross revenue                     0.991244       1.000000        -0.020310   \n",
       "averageRating_x                  -0.024893      -0.020310         1.000000   \n",
       "numVotes                          0.374433       0.376442         0.268273   \n",
       "averageRating_y                  -0.021391      -0.015741         0.873109   \n",
       "\n",
       "                      numVotes  averageRating_y  \n",
       "exhibition  year     -0.118093         0.068265  \n",
       "number of sessions    0.383125        -0.063365  \n",
       "number of spectators  0.374433        -0.021391  \n",
       "gross revenue         0.376442        -0.015741  \n",
       "averageRating_x       0.268273         0.873109  \n",
       "numVotes              1.000000         0.204394  \n",
       "averageRating_y       0.204394         1.000000  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09eaa707",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}