Nowadays everyone talks about 'artificial intelligence', and every research paper seems to mention 'deep learning'. In reality, much of the work is still done by traditional machine learning models. These models have been with us for decades and do a decent job within their limitations.
Traditional machine learning models are usually easier to interpret than neural networks — their learnt decision process is easier to understand. The rule of thumb is that for images, audio or text, modern neural networks are superior to traditional machine learning methods; when we work with traditional tabular data, it often goes the opposite way. It is worth studying the winning solutions on Kaggle to see which models typically win for different data types.
import numpy as np # np array & math
import pandas as pd # to handle data table
import seaborn as sns # high-level plotting package, built on matplotlib
import matplotlib.pyplot as plt # lower-level plotting package
from collections import Counter
# super useful function to count objects in lists
%matplotlib inline
# to have the plots displayed within the notebook
from sklearn import datasets, cluster
from sklearn import neighbors, ensemble, tree, linear_model
from sklearn import model_selection, metrics
# sklearn is the most popular machine learning library in python
# Load the Wisconsin breast-cancer dataset as a sklearn Bunch
# (dict-like; the keys used below: 'data', 'feature_names',
# 'target', 'target_names').
data = datasets.load_breast_cancer()
Fine needle aspirate (FNA) of a breast mass. The dataset contains extracted information of cell nuclei from digitized images.
A few of the extracted features:
compactness
The target variable is binary, the diagnosis (M malignant - 0, B benign - 1).
Image: By Ed Uthman from Houston, TX, USA - Pancreas FNA; adenocarcinoma vs. normal ductal epithelium (400x)Uploaded by CFCF, CC BY 2.0, https://commons.wikimedia.org/w/index.php?curid=30103637
# Assemble the feature matrix into a labeled DataFrame and take a first look.
X = pd.DataFrame(data['data'], columns=data['feature_names'])
X.head()

# Class balance and the names behind the 0/1 labels (notebook display lines).
Counter(data['target'])
data['target_names']

# Per-feature summary statistics.
X.describe()

# Visual and numeric check for missing values.
plt.imshow(X.T.isna())
print(f'# NAs: {X.isna().sum().sum()}')
f'{1+1}foo{"bar".replace("a", " ")}'
# f-string demo (Python 3.6+): arbitrary expressions are evaluated and
# formatted inline -> this cell displays '2foob r'
# Attach the label column, then overlay the 'mean radius' distribution
# of the two classes to see how well a single feature separates them.
X['target'] = data.target
for lbl, name in ((0, 'malignant'), (1, 'benign')):
    plt.hist(X.loc[X['target'] == lbl, 'mean radius'],
             alpha=0.7, label=name, bins=20)
plt.xlabel('mean radius', fontsize=15)
plt.legend(fontsize=15)
# Z-score every feature and reshape to long form so seaborn can draw one
# split violin per feature, colored by class.
plt.figure(figsize=(20, 7))
# FIX: drop('target', 1) relied on the positional `axis` argument, which
# was deprecated and removed in pandas 2.0 — use the explicit keyword form.
data2 = pd.concat([X.target,
                   ((X - X.mean()) / X.std()).drop(columns='target')],
                  axis=1)
data2 = pd.melt(data2, id_vars="target",
                var_name="features",
                value_name='value')
sns.violinplot(x="features", y="value", hue="target", data=data2,
               split=True, inner="quart")
plt.xticks(rotation=90)
plt.show()
# Pairwise feature correlations; annot prints the rounded value per cell.
plt.figure(figsize=(14, 14))
corr = X.corr()
sns.heatmap(corr, annot=True, linewidths=0.5, fmt='.1f')
# FIX: sns.clustermap creates its own figure, so a preceding plt.figure()
# only leaves an empty extra figure behind — pass the size to clustermap.
sns.clustermap(X.T, figsize=(14, 14))
# Same view after z-scoring each feature: the clustering is no longer
# dominated by the features with the largest raw scale.
sns.clustermap(((X - X.mean()) / X.std()).T)
# One unsupervised baseline and four classifiers for comparison.
kmeans = cluster.KMeans(n_clusters=2, random_state=42)
rf = ensemble.RandomForestClassifier(random_state=42)
# FIX: the comment below says random states matter, but the tree and the
# logistic regression were left unseeded — seed them too so every run of
# the notebook reproduces the same scores.
dt = tree.DecisionTreeClassifier(random_state=42)
lr = linear_model.LogisticRegression(random_state=42)
knn = neighbors.KNeighborsClassifier(n_neighbors=5)  # kNN is deterministic
# random states are important for reproducibility
# Cluster on the features only — the label column must be excluded.
# FIX: drop('target', 1) used the positional `axis` argument, removed in
# pandas 2.0 — use the keyword form.
kmeans_clusters = kmeans.fit_predict(X.drop(columns='target'))
# Raw agreement with the labels; cluster ids are arbitrary (see below).
(kmeans_clusters == X['target']).mean()
KMeans does not know which cluster is which, it is unsupervised!
# Flip the arbitrary cluster ids (0 <-> 1) so they line up with the labels.
kmeans_clusters = np.where(kmeans_clusters == 0, 1, 0)
(kmeans_clusters == X['target']).mean()

# Confusion matrix: true label (rows) vs. aligned cluster id (columns).
cm = metrics.confusion_matrix(y_true=X['target'], y_pred=kmeans_clusters)
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel('prediction', fontsize=15)
plt.ylabel('label', fontsize=15)
# Out-of-fold class-probability predictions (5-fold CV) for each model.
# FIX: drop('target', 1) used the positional `axis` argument, removed in
# pandas 2.0 — use the keyword form throughout.
rf_preds = model_selection.cross_val_predict(rf, X.drop(columns='target'),
X['target'], method='predict_proba', cv=5)
tree_preds = model_selection.cross_val_predict(dt, X.drop(columns='target'),
X['target'], method='predict_proba', cv=5)
lr_preds = model_selection.cross_val_predict(lr, X.drop(columns='target'),
X['target'], method='predict_proba', cv=5)
knn_preds = model_selection.cross_val_predict(knn, X.drop(columns='target'),
X['target'], method='predict_proba', cv=5)
# ROC curve per model, AUC (rounded to 3 decimals) in the legend.
plt.figure(figsize=(8, 8))
model_names = ['random forest', 'decision tree', 'logistic regression', 'knn']
for name, preds in zip(model_names, [rf_preds, tree_preds, lr_preds, knn_preds]):
    fpr, tpr, _ = metrics.roc_curve(y_score=preds[:, 1], y_true=data['target'])
    auc = np.round(metrics.roc_auc_score(y_score=preds[:, 1],
                                         y_true=data['target']), 3)
    plt.plot(fpr, tpr, label=name + f': {auc}')
plt.legend(fontsize=15)
plt.plot([0, 1], [0, 1], '--', c='k')  # chance diagonal
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
Remember: ROC curve is generated with sweeping the probability threshold! Can these models provide real, continuous probabilities?
# Z-score all features; parking the label in the index keeps it out of the
# arithmetic. NOTE: reset_index moves 'target' to the first column.
scaled = X.set_index('target')
X = ((scaled - scaled.mean()) / scaled.std()).reset_index()
# Same 5-fold CV probabilities, now on the normalized features.
# FIX: drop('target', 1) used the positional `axis` argument, removed in
# pandas 2.0 — use the keyword form throughout.
rf_preds2 = model_selection.cross_val_predict(rf, X.drop(columns='target'),
X['target'], method='predict_proba', cv=5)
tree_preds2 = model_selection.cross_val_predict(dt, X.drop(columns='target'),
X['target'], method='predict_proba', cv=5)
lr_preds2 = model_selection.cross_val_predict(lr, X.drop(columns='target'),
X['target'], method='predict_proba', cv=5)
knn_preds2 = model_selection.cross_val_predict(knn, X.drop(columns='target'),
X['target'], method='predict_proba', cv=5)
def _roc_panel(pred_sets, title):
    """Draw one ROC subplot comparing the four models' CV probabilities."""
    names = ['random forest', 'decision tree', 'logistic regression', 'knn']
    for name, preds in zip(names, pred_sets):
        fpr, tpr, _ = metrics.roc_curve(y_score=preds[:, 1],
                                        y_true=data['target'])
        auc = np.round(metrics.roc_auc_score(y_score=preds[:, 1],
                                             y_true=data['target']), 3)
        plt.plot(fpr, tpr, label=name + f': {auc}')
    plt.legend(fontsize=15)
    plt.plot([0, 1], [0, 1], '--', c='k')  # chance diagonal
    plt.xlabel('False Positive Rate', fontsize=15)
    plt.ylabel('True Positive Rate', fontsize=15)
    plt.title(title, fontsize=20)

# Side-by-side: raw features (left) vs. z-scored features (right).
plt.figure(figsize=(16, 8))
plt.subplot(121)
_roc_panel([rf_preds, tree_preds, lr_preds, knn_preds], 'Without normalization')
plt.subplot(122)
_roc_panel([rf_preds2, tree_preds2, lr_preds2, knn_preds2], 'With normalization')
plt.show()
There are also models outside of sklearn, such as