{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## PCA ##\n",
    "\n",
    "Explain the initial dataset\n",
    "\n",
    "What type of transformation was performed?\n",
    "\n",
    "What is the meaning of this transformation?\n",
    "\n",
    "What other type of processing may be performed?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tunable settings, kept in one place.\n",
    "DATA_URL = 'https://raw.githubusercontent.com/masterfloss/data/main/worlddata.csv'\n",
    "MIN_OBSERVATIONS = 200  # a column is kept only if it has at least this many non-null values\n",
    "N_COMPONENTS = 4        # number of principal components to extract"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(DATA_URL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Country object\n",
       "Area_km2 int64\n",
       "Birth rate(births/1000 population) float64\n",
       "Current account balance float64\n",
       "Death rate(deaths/1000 population) float64\n",
       "Debt - external float64\n",
       "Electricity - consumption(kWh) float64\n",
       "Electricity - production(kWh) float64\n",
       "Exports float64\n",
       "GDP object\n",
       "GDPpercapita float64\n",
       "GDP - real growth rate(%) float64\n",
       "HIV/AIDS - adult prevalence rate(%) float64\n",
       "HIV/AIDS - deaths float64\n",
       "HIV/AIDS - people living with HIV/AIDS float64\n",
       "Highways(km) float64\n",
       "Imports float64\n",
       "Industrial production growth rate(%) float64\n",
       "Infant mortality rate(deaths/1000 live births) float64\n",
       "Inflation rate (consumer prices)(%) float64\n",
       "Internet hosts float64\n",
       "Internet users float64\n",
       "Investment (gross fixed)(% of GDP) float64\n",
       "Labor force float64\n",
       "Life expectancy at birth(years) float64\n",
       "Military expenditures - dollar figure float64\n",
       "MilitPercentGDP float64\n",
       "Natural gas - consumption(cu m) float64\n",
       "Natural gas - exports(cu m) float64\n",
       "Natural gas - imports(cu m) float64\n",
       "Natural gas - production(cu m) float64\n",
       "Natural gas - proved reserves(cu m) float64\n",
       "Oil - consumption(bbl/day) float64\n",
       "Oil - exports(bbl/day) float64\n",
       "Oil - imports(bbl/day) float64\n",
       "Oil - production(bbl/day) float64\n",
       "Oil - proved reserves(bbl) float64\n",
       "Population float64\n",
       "Public debt(% of GDP) float64\n",
       "Railways(km) float64\n",
       "Reserves of foreign exchange & gold float64\n",
       "Telephones - main lines in use float64\n",
       "Telephones - mobile cellular float64\n",
       "Total fertility rate(children born/woman) float64\n",
       "Unemploy rate(%) float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GDP is read as an object column (see the dtypes above). Convert it to a\n",
    "# numeric dtype; errors='coerce' turns unparseable entries into NaN.\n",
    "df['GDP'] = pd.to_numeric(df['GDP'], downcast='float', errors='coerce')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(263, 45)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clean the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Keep only columns with at least MIN_OBSERVATIONS non-null values\n",
    "#    (columns with fewer observations are removed).\n",
    "# 2. Keep only rows that are complete in the remaining columns.\n",
    "# 3. Drop the 'Country' label so that only numeric features remain.\n",
    "# df1 is rebuilt from df in one expression, so this cell can be re-run safely.\n",
    "df1 = (\n",
    "    df.loc[:, df.count() >= MIN_OBSERVATIONS]\n",
    "    .dropna()\n",
    "    .drop(columns=['Country'])\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Standardise the features and fit the PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standardise every feature before PCA.\n",
    "standardizer = StandardScaler()\n",
    "X = standardizer.fit_transform(df1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "PCAModel = PCA(n_components=N_COMPONENTS)\n",
    "XPCA = PCAModel.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original number of features: 23\n",
      "Reduced number of features: 4\n"
     ]
    }
   ],
   "source": [
    "print('Original number of features:', X.shape[1])\n",
    "print('Reduced number of features:', XPCA.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.47834598, 0.1899588 , 0.07468206, 0.05152297])"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share of the total variance explained by each retained component.\n",
    "PCAModel.explained_variance_ratio_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Component weights per variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Area_km2',\n",
       " 'Birth rate(births/1000 population)',\n",
       " 'Death rate(deaths/1000 population)',\n",
       " 'Debt - external',\n",
       " 'Electricity - consumption(kWh)',\n",
       " 'Electricity - production(kWh)',\n",
       " 'Exports',\n",
       " 'GDP',\n",
       " 'GDPpercapita',\n",
       " 'GDP - real growth rate(%)',\n",
       " 'Highways(km)',\n",
       " 'Imports',\n",
       " 'Infant mortality rate(deaths/1000 live births)',\n",
       " 'Inflation rate (consumer prices)(%)',\n",
       " 'Internet users',\n",
       " 'Labor force',\n",
       " 'Life expectancy at birth(years)',\n",
       " 'Oil - consumption(bbl/day)',\n",
       " 'Oil - production(bbl/day)',\n",
       " 'Population',\n",
       " 'Telephones - main lines in use',\n",
       " 'Telephones - mobile cellular',\n",
       " 'Total fertility rate(children born/woman)']"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Transpose components_ so that each row corresponds to one input variable\n",
    "# and each column to one principal component.\n",
    "compScores = PCAModel.components_.T\n",
    "columnList = list(df1.columns.values)\n",
    "columnList"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per variable (used as the index), one column per component.\n",
    "dfscores = pd.DataFrame(compScores)\n",
    "dfscores['variables'] = columnList\n",
    "dfscores = dfscores.set_index('variables')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
0 | 1 | 2 | 3 | |
---|---|---|---|---|
variables | ||||
Area_km2 | \n", "0.196918 | \n", "0.078018 | \n", "0.050004 | \n", "0.318586 | \n", "
Birth rate(births/1000 population) | \n", "-0.089315 | \n", "0.409577 | \n", "-0.064021 | \n", "0.092177 | \n", "
Death rate(deaths/1000 population) | \n", "-0.048461 | \n", "0.328146 | \n", "-0.200144 | \n", "-0.244899 | \n", "
Debt - external | \n", "0.130537 | \n", "-0.045956 | \n", "-0.295037 | \n", "-0.172489 | \n", "
Electricity - consumption(kWh) | \n", "0.288651 | \n", "0.062448 | \n", "-0.119725 | \n", "0.055438 | \n", "
Electricity - production(kWh) | \n", "0.290630 | \n", "0.064372 | \n", "-0.097005 | \n", "0.044311 | \n", "
Exports | \n", "0.274613 | \n", "-0.031945 | \n", "-0.120021 | \n", "-0.073495 | \n", "
GDP | \n", "0.294102 | \n", "0.072188 | \n", "-0.012551 | \n", "-0.040399 | \n", "
GDPpercapita | \n", "0.108571 | \n", "-0.304671 | \n", "-0.237989 | \n", "-0.117714 | \n", "
GDP - real growth rate(%) | \n", "0.016927 | \n", "0.009405 | \n", "0.168792 | \n", "0.633484 | \n", "
Highways(km) | \n", "0.270373 | \n", "0.068151 | \n", "-0.109271 | \n", "0.039071 | \n", "
Imports | \n", "0.278818 | \n", "-0.001610 | \n", "-0.235177 | \n", "-0.070954 | \n", "
Infant mortality rate(deaths/1000 live births) | \n", "-0.078195 | \n", "0.430071 | \n", "-0.051599 | \n", "0.018524 | \n", "
Inflation rate (consumer prices)(%) | \n", "-0.027746 | \n", "0.174029 | \n", "-0.056930 | \n", "0.000404 | \n", "
Internet users | \n", "0.289896 | \n", "0.055756 | \n", "-0.071564 | \n", "-0.066624 | \n", "
Labor force | \n", "0.199585 | \n", "0.113688 | \n", "0.508396 | \n", "-0.150078 | \n", "
Life expectancy at birth(years) | \n", "0.083859 | \n", "-0.431921 | \n", "0.094147 | \n", "0.106816 | \n", "
Oil - consumption(bbl/day) | \n", "0.276043 | \n", "0.047094 | \n", "-0.216795 | \n", "0.076420 | \n", "
Oil - production(bbl/day) | \n", "0.184365 | \n", "0.026926 | \n", "-0.100888 | \n", "0.512215 | \n", "
Population | \n", "0.194928 | \n", "0.118476 | \n", "0.500456 | \n", "-0.129348 | \n", "
Telephones - main lines in use | \n", "0.276921 | \n", "0.078700 | \n", "0.203591 | \n", "-0.102200 | \n", "
Telephones - mobile cellular | \n", "0.267611 | \n", "0.057041 | \n", "0.199527 | \n", "-0.159440 | \n", "
Total fertility rate(children born/woman) | \n", "-0.083487 | \n", "0.400337 | \n", "-0.091452 | \n", "0.083502 | \n", "