{ "cells": [ { "cell_type": "markdown", "id": "f3571d1d", "metadata": {}, "source": [ "### Lab7\n", "\n", "Consider the following files that have information about movies:\n", "https://github.com/masterfloss/datamovies/raw/main/movies_ratings.tsv (this dataset includes information related to ratings of IMDb)\n", "https://github.com/masterfloss/datamovies/raw/main/moviesPT3.xlsx (this dataset includes information related to the movies exhibited in Portugal)\n", "1.\tExplain the manipulation of the dataset\n", "2.\tCreate new columns (e.g., gross revenue by a spectator, gross revenue by session, spectators by session)\n", "3.\tCreate a regression model\n", "4.\tCreate a classification model\n", "5.\tUse unsupervised learning (cluster analysis and Dimensionally reduction)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "796ce725", "metadata": {}, "outputs": [], "source": [ "Read " ] }, { "cell_type": "code", "execution_count": 1, "id": "9c80bf16", "metadata": {}, "outputs": [], "source": [ "#For regression, use the follwoing libraies\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, explained_variance_score,r2_score\n", "from sklearn.neural_network import MLPRegressor\n", "from sklearn.linear_model import Ridge\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "code", "execution_count": 2, "id": "9d575728", "metadata": {}, "outputs": [], "source": [ "#For classification, use the follwoing libtaries\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.svm import SVC\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 3, "id": "9d04ea4a", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 4, "id": "eaa827d9", "metadata": {}, "outputs": [], "source": [ "df1=pd.read_csv('https://github.com/masterfloss/datamovies/raw/main/movies_ratings.tsv',sep='\\t')\n", "df2=pd.read_excel('https://github.com/masterfloss/datamovies/raw/main/moviesPT3.xlsx')" ] }, { "cell_type": "code", "execution_count": 7, "id": "b339513e", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "for i in df2.columns:\n", " df2[i].fillna(method ='ffill', inplace = True) " ] }, { "cell_type": "code", "execution_count": 9, "id": "084c6c74", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "for i in df2.index:\n", " if(df2['Title.1'][i]!=df2['Title'][i]):\n", " df2.drop(i,inplace=True)" ] }, { "cell_type": "code", "execution_count": 11, "id": "93f8ef50", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "df2=df2.rename(columns={\"ID Imdb\": \"tconst\"})\n", "df2=df2.merge(df1, on='tconst', how='left')" ] }, { "cell_type": "code", "execution_count": 13, "id": "82cd183d", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "Tdirctors=df2.groupby(by=[\"Director\"]).mean()\n", "Tdirctors=Tdirctors.reset_index()[['Director','averageRating']]\n", "df2=df2.merge(Tdirctors, on='Director', how='left')" ] }, { "cell_type": "code", "execution_count": 17, "id": "1153a2e7", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "df2=df2.drop(columns=['CodeIGAC','ID Imdb1','distributor'])\n", "df2=df2.dropna()" ] }, { "cell_type": "code", "execution_count": 22, "id": "28fee967", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
exhibition yearnumber of sessionsnumber of spectatorsgross revenueaverageRating_xnumVotesaverageRating_y
exhibition year1.000000-0.115227-0.117738-0.0898280.030095-0.1180930.068265
number of sessions-0.1152271.0000000.9307090.926633-0.0719720.383125-0.063365
number of spectators-0.1177380.9307091.0000000.991244-0.0248930.374433-0.021391
gross revenue-0.0898280.9266330.9912441.000000-0.0203100.376442-0.015741
averageRating_x0.030095-0.071972-0.024893-0.0203101.0000000.2682730.873109
numVotes-0.1180930.3831250.3744330.3764420.2682731.0000000.204394
averageRating_y0.068265-0.063365-0.021391-0.0157410.8731090.2043941.000000
\n", "
" ], "text/plain": [ " exhibition year number of sessions \\\n", "exhibition year 1.000000 -0.115227 \n", "number of sessions -0.115227 1.000000 \n", "number of spectators -0.117738 0.930709 \n", "gross revenue -0.089828 0.926633 \n", "averageRating_x 0.030095 -0.071972 \n", "numVotes -0.118093 0.383125 \n", "averageRating_y 0.068265 -0.063365 \n", "\n", " number of spectators gross revenue averageRating_x \\\n", "exhibition year -0.117738 -0.089828 0.030095 \n", "number of sessions 0.930709 0.926633 -0.071972 \n", "number of spectators 1.000000 0.991244 -0.024893 \n", "gross revenue 0.991244 1.000000 -0.020310 \n", "averageRating_x -0.024893 -0.020310 1.000000 \n", "numVotes 0.374433 0.376442 0.268273 \n", "averageRating_y -0.021391 -0.015741 0.873109 \n", "\n", " numVotes averageRating_y \n", "exhibition year -0.118093 0.068265 \n", "number of sessions 0.383125 -0.063365 \n", "number of spectators 0.374433 -0.021391 \n", "gross revenue 0.376442 -0.015741 \n", "averageRating_x 0.268273 0.873109 \n", "numVotes 1.000000 0.204394 \n", "averageRating_y 0.204394 1.000000 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.corr()" ] }, { "cell_type": "code", "execution_count": null, "id": "09eaa707", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }