The file woldbank_development_2015.csv (located in the same folder as this notebook) contains the World Development Indicators for the year 2015, downloaded from the World Bank's website.
The file woldbank_development_2015.csv (located in the same folder as this notebook) contains the World Development Indicators for the year 2015, downloaded from the World Bank's website.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
# Load the long-format table, treating the placeholders '..' and '...' as
# missing values so that the affected cells are parsed as NaN.
MISSING_MARKERS = ['..', '...']
long = pd.read_csv('woldbank_development_2015.csv', na_values=MISSING_MARKERS)
long.head()
# Look at the distinct names at positions 210-224 (in order of first
# appearance) to see which entries sit in that part of the list.
long['Country Name'].unique()[210:225]
%%time
# Reshape the long table (one row per country / indicator pair) into a wide
# table with one row per country and one column per indicator code.
series_codes = pd.unique(long['Series Code'].dropna())

# Only the first N_COUNTRIES distinct names are treated as countries; the
# point where countries stop and aggregate regions start was found by hand.
N_COUNTRIES = 217

country_rows = []
for c in pd.unique(long['Country Name'])[:N_COUNTRIES]:
    tmp = (
        long.loc[long['Country Name'] == c, ['Series Code', '2015 [YR2015]']]
        .set_index('Series Code')
        .T
    )
    # tmp is a single-row frame here:
    ##   index:   2015 [YR2015]
    ##   columns: Series Code
    tmp['country'] = c
    country_rows.append(tmp[['country'] + list(series_codes)])

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0,
# and calling it inside the loop re-copied the accumulated frame each time.
wide = pd.concat(country_rows)
wide = wide.set_index('country').astype(float)
wide.head()
# Same reshape using pandas' built-in pivot: keep only the countries first
# (the first 217 distinct names), then pivot to one row per country.
country_names = pd.unique(long['Country Name'])[:217]
long2 = long[long['Country Name'].isin(country_names)]
pivoted = long2.pivot(index='Country Name', columns='Series Code', values='2015 [YR2015]')
pivoted[series_codes].head()
# np.allclose compares the underlying arrays position by position and ignores
# the labels, so line the rows and columns of `pivoted` up with `wide` first
# (pivot returns its rows sorted by label, `wide` keeps the file order).
# equal_nan=True makes a missing value match only another missing value,
# instead of filling with 0 and letting NaN compare equal to a real 0.
np.allclose(
    pivoted.loc[wide.index, series_codes].to_numpy(dtype=float),
    wide[series_codes].to_numpy(dtype=float),
    equal_nan=True,
)
# in general we do not want to compare floats with == too much
np.allclose??
# Show where values are missing, as a table and as an image (True = missing).
wide.isna()
plt.imshow(wide.isna())

# Count the missing cells per indicator (column) and per country (row).
missing_per_indicator = wide.isna().sum()
missing_per_country = wide.isna().sum(axis=1)

# Keep indicators missing for fewer than 20 countries and countries missing
# fewer than 700 indicators.
keep_cols = missing_per_indicator[missing_per_indicator < 20].index.values
keep_rows = missing_per_country[missing_per_country < 700].index.values
row_mask = wide.index.isin(keep_rows)
filtered_wide = wide.loc[row_mask, keep_cols]
filtered_wide.shape
plt.imshow(filtered_wide.isna())

# Fill each remaining gap with the mean of its indicator column.
filtered_wide = filtered_wide.fillna(filtered_wide.mean())

# List the readable names of the indicators that survived the filter.
long.loc[long['Series Code'].isin(keep_cols), 'Series Name'].drop_duplicates().tolist()
# Project the unscaled indicator table onto its first three principal
# components and draw every pair of components side by side.
pca = PCA(n_components=3)
pca_transformed = pca.fit_transform(filtered_wide)
plt.figure(figsize=(17, 6))
component_pairs = [(0, 1), (0, 2), (1, 2)]
for panel, (x_comp, y_comp) in enumerate(component_pairs, start=1):
    plt.subplot(1, 3, panel)
    plt.scatter(pca_transformed[:, x_comp], pca_transformed[:, y_comp])
# Standardise every indicator (subtract the column mean, divide by the column
# standard deviation), then repeat the three-component PCA and the pairwise
# scatter plots on the scaled table.
column_means = filtered_wide.mean()
column_stds = filtered_wide.std()
filtered_scaled_wide = (filtered_wide - column_means) / column_stds
pca = PCA(n_components=3)
pca_transformed = pca.fit_transform(filtered_scaled_wide)
plt.figure(figsize=(17, 6))
component_pairs = [(0, 1), (0, 2), (1, 2)]
for panel, (x_comp, y_comp) in enumerate(component_pairs, start=1):
    plt.subplot(1, 3, panel)
    plt.scatter(pca_transformed[:, x_comp], pca_transformed[:, y_comp])
# Embed the standardised indicators with t-SNE (default settings otherwise).
# t-SNE is stochastic: without a fixed seed every run produces a different
# layout, so pin random_state to make the figures reproducible.
tsne = TSNE(random_state=0)
ts_transformed = tsne.fit_transform(filtered_scaled_wide)
plt.scatter(ts_transformed[:,0], ts_transformed[:,1])
def plot_labelled_embedding(points, labels, highlight=('Hungary', 'Norway'), figsize=(16, 12)):
    """Scatter a 2-D embedding and write each point's label next to it.

    Labels listed in ``highlight`` get a large red marker and a 15pt label;
    every other label is written at 8pt. The two cases are mutually
    exclusive, so a highlighted point is not labelled a second time with the
    small font on top of the large one.

    Args:
        points: array indexed as ``points[:, 0]`` / ``points[:, 1]`` holding
            the x and y coordinates, one row per label.
        labels: sequence of label strings, in the same order as ``points``.
        highlight: labels to emphasise.
        figsize: size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    plt.scatter(points[:, 0], points[:, 1])
    for idx, label in enumerate(labels):
        x, y = points[idx, 0], points[idx, 1]
        if label in highlight:
            plt.text(x, y, label, fontsize=15)
            plt.scatter([x], [y], c='r', s=150)
        else:
            plt.text(x, y, label, fontsize=8)


# Row labels of the scaled table, in the same order as the embedding rows.
countries = filtered_scaled_wide.index.tolist()

# Draw the labelled map twice: once at a normal size and once much larger,
# both with the same labelling rules.
plot_labelled_embedding(ts_transformed, countries, figsize=(16, 12))
plot_labelled_embedding(ts_transformed, countries, figsize=(30, 20))
# Clustered heatmap of the standardised indicators. Column clustering is
# switched off (col_cluster=False), and the figure is 10 wide by 45 tall.
sns.clustermap(
    filtered_scaled_wide,
    col_cluster=False,
    figsize=(10, 45),
)