{ "cells": [ { "cell_type": "markdown", "id": "f3571d1d", "metadata": {}, "source": [ "### Lab 7\n", "\n", "Consider the following files that have information about movies:\n", "\n", "- https://github.com/masterfloss/datamovies/raw/main/movies_ratings.tsv (this dataset includes information related to IMDb ratings)\n", "- https://github.com/masterfloss/datamovies/raw/main/moviesPT3.xlsx (this dataset includes information related to the movies exhibited in Portugal)\n", "\n", "1. Explain the manipulation of the dataset\n", "2. Create new columns (e.g., gross revenue per spectator, gross revenue per session, spectators per session)\n", "3. Create a regression model\n", "4. Create a classification model\n", "5. Use unsupervised learning (cluster analysis and dimensionality reduction)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "796ce725", "metadata": {}, "outputs": [], "source": [ "# Read the datasets (was the bare word `Read`, which raises NameError when the cell is executed)" ] }, { "cell_type": "code", "execution_count": 1, "id": "9c80bf16", "metadata": {}, "outputs": [], "source": [ "# For regression, use the following libraries\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, explained_variance_score,r2_score\n", "from sklearn.neural_network import MLPRegressor\n", "from sklearn.linear_model import Ridge\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "code", "execution_count": 2, "id": "9d575728", "metadata": {}, "outputs": [], "source": [ "# For classification, use the following libraries\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.svm import SVC\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n", "import matplotlib.pyplot as plt" ] }, { 
"cell_type": "code", "execution_count": 3, "id": "9d04ea4a", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 4, "id": "eaa827d9", "metadata": {}, "outputs": [], "source": [ "df1=pd.read_csv('https://github.com/masterfloss/datamovies/raw/main/movies_ratings.tsv',sep='\\t')\n", "df2=pd.read_excel('https://github.com/masterfloss/datamovies/raw/main/moviesPT3.xlsx')" ] }, { "cell_type": "code", "execution_count": 7, "id": "b339513e", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "for i in df2.columns:\n", " df2[i].fillna(method ='ffill', inplace = True) " ] }, { "cell_type": "code", "execution_count": 9, "id": "084c6c74", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "for i in df2.index:\n", " if(df2['Title.1'][i]!=df2['Title'][i]):\n", " df2.drop(i,inplace=True)" ] }, { "cell_type": "code", "execution_count": 11, "id": "93f8ef50", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "df2=df2.rename(columns={\"ID Imdb\": \"tconst\"})\n", "df2=df2.merge(df1, on='tconst', how='left')" ] }, { "cell_type": "code", "execution_count": 13, "id": "82cd183d", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "Tdirctors=df2.groupby(by=[\"Director\"]).mean()\n", "Tdirctors=Tdirctors.reset_index()[['Director','averageRating']]\n", "df2=df2.merge(Tdirctors, on='Director', how='left')" ] }, { "cell_type": "code", "execution_count": 17, "id": "1153a2e7", "metadata": {}, "outputs": [], "source": [ "# Explain\n", "df2=df2.drop(columns=['CodeIGAC','ID Imdb1','distributor'])\n", "df2=df2.dropna()" ] }, { "cell_type": "code", "execution_count": 22, "id": "28fee967", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | exhibition year | \n", "number of sessions | \n", "number of spectators | \n", "gross revenue | \n", "averageRating_x | \n", "numVotes | \n", "averageRating_y | \n", "
---|---|---|---|---|---|---|---|
exhibition year | \n", "1.000000 | \n", "-0.115227 | \n", "-0.117738 | \n", "-0.089828 | \n", "0.030095 | \n", "-0.118093 | \n", "0.068265 | \n", "
number of sessions | \n", "-0.115227 | \n", "1.000000 | \n", "0.930709 | \n", "0.926633 | \n", "-0.071972 | \n", "0.383125 | \n", "-0.063365 | \n", "
number of spectators | \n", "-0.117738 | \n", "0.930709 | \n", "1.000000 | \n", "0.991244 | \n", "-0.024893 | \n", "0.374433 | \n", "-0.021391 | \n", "
gross revenue | \n", "-0.089828 | \n", "0.926633 | \n", "0.991244 | \n", "1.000000 | \n", "-0.020310 | \n", "0.376442 | \n", "-0.015741 | \n", "
averageRating_x | \n", "0.030095 | \n", "-0.071972 | \n", "-0.024893 | \n", "-0.020310 | \n", "1.000000 | \n", "0.268273 | \n", "0.873109 | \n", "
numVotes | \n", "-0.118093 | \n", "0.383125 | \n", "0.374433 | \n", "0.376442 | \n", "0.268273 | \n", "1.000000 | \n", "0.204394 | \n", "
averageRating_y | \n", "0.068265 | \n", "-0.063365 | \n", "-0.021391 | \n", "-0.015741 | \n", "0.873109 | \n", "0.204394 | \n", "1.000000 | \n", "