The credit card fraud dataset contains anonymized credit card transactions labeled as fraudulent or genuine. The dataset is highly unbalanced: the positive class (frauds) accounts for a very small percentage of all transactions.
The attributes are already the result of a PCA transformation; the original features and further background information about the data are not provided, because the features have been anonymized into numerical values (V1–V28). The only features that have not been transformed with PCA are 'Time' and 'Amount'. The feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used, for example, for example-dependent cost-sensitive learning. The feature 'Class' is the response variable and takes the value 1 in case of fraud and 0 otherwise.
import numpy as np
import pandas as pd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Load the credit card transactions from a CSV in the current working directory.
data = pd.read_csv('./creditcardfraud.csv')
# Frame dimensions as (rows, columns).
data.shape
(284807, 31)
# Preview the first five rows to inspect the column layout.
data.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
# Total number of missing cells across every column of the frame.
data.isnull().values.sum()
0
# Count the transactions per class label.
# The resulting column names are pinned explicitly ('index' holds the label,
# 'Class' holds the count): pandas >= 2.0 names them 'Class' and 'count' when
# value_counts().reset_index() is called bare, which would break the rename
# that follows, since it is keyed on 'index' and 'Class'.
transaction_class = (
    data['Class']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Class')
)
transaction_class
index | Class | |
---|---|---|
0 | 0 | 284315 |
1 | 1 | 492 |
# Give the two columns readable names.
transaction_class = transaction_class.rename(
    columns={'index': 'Transaction Type', 'Class': 'Frequency'}
)
# Translate the numeric class labels into words.
type_names = {0: 'Normal', 1: 'Fraud'}
transaction_class['Transaction Type'] = transaction_class['Transaction Type'].map(type_names)
# Share of each class among all transactions, as a fraction of the total count.
total_transactions = transaction_class['Frequency'].sum()
transaction_class['Percentage'] = transaction_class['Frequency'] / total_transactions
transaction_class
Transaction Type | Frequency | Percentage | |
---|---|---|---|
0 | Normal | 284315 | 0.998273 |
1 | Fraud | 492 | 0.001727 |
# One label per bar: the absolute count plus its share of all transactions.
bar_labels = [
    'Frequency:{}<br>Percentage:{:.2f}%'.format(count, 100 * share)
    for count, share in zip(transaction_class['Frequency'], transaction_class['Percentage'])
]
# Horizontal bar chart with one bar per transaction type.
transaction_class_fig = px.bar(
    transaction_class,
    x='Frequency',
    y='Transaction Type',
    orientation='h',
    text=bar_labels,
    height=300,
    width=800,
)
# Centre the title horizontally and anchor it near the top edge.
transaction_class_fig.update_layout(
    title={'text': 'Transaction Types', 'y': 0.97, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'}
)
def draw_hist(dataframe, x_name, x_rename, title, color, nbins_count = 100, height = 400, width = 900, log_y=False):
    """Draw and show a single-colour Plotly histogram of one column.

    Args:
        dataframe: Data handed to ``px.histogram``.
        x_name: Name of the column binned along the x axis.
        x_rename: Axis label displayed in place of ``x_name``.
        title: Figure title, centred above the plot.
        color: Colour used for the histogram bars.
        nbins_count: Value forwarded to ``px.histogram`` as ``nbins``.
        height: Figure height forwarded to ``px.histogram``.
        width: Figure width forwarded to ``px.histogram``.
        log_y: Forwarded to ``px.histogram`` as ``log_y``.

    Returns:
        None. The figure is displayed through ``fig.show()``.
    """
    centred_title = {'text': title, 'y': 0.97, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'}
    # Optional extras left disabled: range_x=[column min, column max] and marginal='box'.
    fig = px.histogram(
        dataframe,
        x=x_name,
        nbins=nbins_count,
        labels={x_name: x_rename},
        opacity=0.9,
        color_discrete_sequence=[color],  # a single colour for every bar
        height=height,
        width=width,
        log_y=log_y,
    )
    fig.update_layout(title=centred_title)
    fig.show()
# Split the transactions by label so each class can be plotted on its own.
class_label = data['Class']
normal_transaction = data[class_label == 0]
fraud_transaction = data[class_label == 1]

# (frame, title, bar colour) for the two time histograms.
time_hist_specs = [
    (normal_transaction, 'Normal Transaction Time Distribution', 'rgb(156,219,165)'),
    (fraud_transaction, 'Fraud Transaction Time Distribution', 'rgb(214,96,77)'),
]
# First pass draws both classes with log_y off, second pass with log_y on.
for use_log_axis in (False, True):
    for frame, hist_title, bar_color in time_hist_specs:
        draw_hist(frame, 'Time', 'Transaction Time', hist_title, bar_color, log_y=use_log_axis)
# Title placement shared by every figure in this section: centred, anchored near the top.
title_anchor = {'y': 0.97, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'}

# Box plots of the transaction amount, one per class.
normal_amount_box = px.box(normal_transaction, x='Amount', width=700, height=200)
normal_amount_box.update_layout(
    title=dict(text='Normal Transaction Amount Distribution', **title_anchor)
)
normal_amount_box.show()

fraud_amount_box = px.box(fraud_transaction, x='Amount', width=700, height=200)
fraud_amount_box.update_layout(
    title=dict(text='Fraud Transaction Amount Distribution', **title_anchor)
)
fraud_amount_box.show()

# Histograms of the amount with 30 bins on a smaller canvas.
amount_hist_size = {'nbins_count': 30, 'height': 300, 'width': 700}
draw_hist(
    normal_transaction, 'Amount', 'Transaction Amount',
    'Normal Transaction Amount Distribution', 'rgb(156,219,165)',
    **amount_hist_size
)
draw_hist(
    fraud_transaction, 'Amount', 'Transaction Amount',
    'Fraud Transaction Amount Distribution', 'rgb(214,96,77)',
    **amount_hist_size
)

# Scatter plots of amount against elapsed time, one per class.
normal_time_amount_fig = px.scatter(normal_transaction, x='Time', y='Amount', height=380, width=700)
normal_time_amount_fig.update_layout(
    title=dict(text='Normal Transaction Time vs Amount', **title_anchor)
)
normal_time_amount_fig.show()

fraud_time_amount_fig = px.scatter(fraud_transaction, x='Time', y='Amount', height=380, width=700)
fraud_time_amount_fig.update_layout(
    title=dict(text='Fraud Transaction Time vs Amount', **title_anchor)
)
fraud_time_amount_fig.show()
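Preprocessing is essential before modelling: 'Time' and 'Amount' have not been scaled like the PCA features and therefore need scaling, and the strong class imbalance has to be addressed.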
from sklearn.preprocessing import StandardScaler, RobustScaler
# Standardise 'Amount' and 'Time' with StandardScaler and store the results
# as new columns next to the raw values.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed further down, so test-set statistics influence
# the scaling — confirm this is acceptable.
std_scaler = StandardScaler()
data['stdScaledAmount'] = std_scaler.fit_transform(data['Amount'].values.reshape(-1,1))
data['stdScaledTime'] = std_scaler.fit_transform(data['Time'].values.reshape(-1,1))
# Print the stored columns instead of re-fitting the scaler just for display;
# the values are the same and the redundant passes over the data are avoided.
print('Original Amount:',data['Amount'].values.reshape(-1,1))
print('Std Scaled Amount:',data['stdScaledAmount'].values.reshape(-1,1))
print('Original Time:',data['Time'].values.reshape(-1,1))
print('Std Scaled Time:',data['stdScaledTime'].values.reshape(-1,1))
Original Amount: [[149.62] [ 2.69] [378.66] ... [ 67.88] [ 10. ] [217. ]] Std Scaled Amount: [[ 0.24496426] [-0.34247454] [ 1.16068593] ... [-0.0818393 ] [-0.31324853] [ 0.51435531]] Original Time: [[0.00000e+00] [0.00000e+00] [1.00000e+00] ... [1.72788e+05] [1.72788e+05] [1.72792e+05]] Std Scaled Time: [[-1.99658302] [-1.99658302] [-1.99656197] ... [ 1.6419735 ] [ 1.6419735 ] [ 1.64205773]]
# Scale 'Amount' and 'Time' with RobustScaler and store the results as new
# columns next to the raw values.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed further down, so test-set statistics influence
# the scaling — confirm this is acceptable.
rub_scaler = RobustScaler()
data['rubScaledAmount'] = rub_scaler.fit_transform(data['Amount'].values.reshape(-1,1))
data['rubScaledTime'] = rub_scaler.fit_transform(data['Time'].values.reshape(-1,1))
# Print the stored columns instead of re-fitting the scaler just for display;
# the values are the same and the redundant passes over the data are avoided.
print('Original Amount:',data['Amount'].values.reshape(-1,1))
print('Rub Scaled Amount:',data['rubScaledAmount'].values.reshape(-1,1))
print('Original Time:',data['Time'].values.reshape(-1,1))
print('Rub Scaled Time:',data['rubScaledTime'].values.reshape(-1,1))
Original Amount: [[149.62] [ 2.69] [378.66] ... [ 67.88] [ 10. ] [217. ]] Rub Scaled Amount: [[ 1.78327395] [-0.26982463] [ 4.98372109] ... [ 0.64109551] [-0.16767973] [ 2.72479564]] Original Time: [[0.00000e+00] [0.00000e+00] [1.00000e+00] ... [1.72788e+05] [1.72788e+05] [1.72792e+05]] Rub Scaled Time: [[-0.99498349] [-0.99498349] [-0.99497175] ... [ 1.03497457] [ 1.03497457] [ 1.03502156]]
# Drop the raw 'Time' and 'Amount' columns; every other column is kept.
# NOTE(review): both the standard-scaled and the robust-scaled variants remain
# in the frame, so each of the two features appears twice — confirm intended.
new_data = data.drop(['Time','Amount'], axis = 1)
# Preview the first five rows of the reduced frame.
new_data.head()
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V24 | V25 | V26 | V27 | V28 | Class | stdScaledAmount | stdScaledTime | rubScaledAmount | rubScaledTime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | ... | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 0 | 0.244964 | -1.996583 | 1.783274 | -0.994983 |
1 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | ... | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 0 | -0.342475 | -1.996583 | -0.269825 | -0.994983 |
2 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | ... | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 0 | 1.160686 | -1.996562 | 4.983721 | -0.994972 |
3 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | ... | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 0 | 0.140534 | -1.996562 | 1.418291 | -0.994972 |
4 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | ... | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 0 | -0.073403 | -1.996541 | 0.670579 | -0.994960 |
5 rows × 33 columns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
# Predictors: every column except the label.
X = new_data.drop(columns='Class')
# Target: kept as a single-column DataFrame rather than a Series.
y = new_data[['Class']]
print('X shape:',X.shape, '\ny shape:',y.shape)
X shape: (284807, 32) y shape: (284807, 1)
# Ten stratified shuffle splits with 30% held out for testing, seeded for reproducibility.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, train_size=None, random_state=80)

# Alternative left disabled: a single plain random split on NumPy arrays via
# train_test_split(X, y, test_size=0.2, random_state=80).

# Every split is printed, but each iteration overwrites the previous
# selection, so only the final split is carried forward.
for train_index, test_index in sss.split(X, y):
    print('Train:', train_index, 'Test:', test_index)
    original_Xtrain = X.iloc[train_index]
    original_Xtest = X.iloc[test_index]
    original_ytrain = y.iloc[train_index]
    original_ytest = y.iloc[test_index]

# NumPy versions of the final split.
original_X_train, original_X_test, original_y_train, original_y_test = (
    np.array(part)
    for part in (original_Xtrain, original_Xtest, original_ytrain, original_ytest)
)
Train: [277279 271167 243548 ... 230480 200350 106351] Test: [234266 275718 133564 ... 244125 28889 214407] Train: [106342 32307 134295 ... 16289 55959 113257] Test: [145064 105889 141570 ... 113839 57961 74335] Train: [ 62862 48218 280526 ... 73828 8090 147350] Test: [142773 273368 15299 ... 246861 187187 168339] Train: [ 89779 111045 68471 ... 119218 186855 113718] Test: [138949 230823 37236 ... 27393 14353 240755] Train: [250963 29495 63061 ... 12292 250661 270470] Test: [176901 81122 97173 ... 171813 50060 251697] Train: [184435 266521 53453 ... 274760 107306 3101] Test: [165220 227906 6723 ... 267331 14144 5792] Train: [ 17871 69032 190049 ... 68457 86789 137289] Test: [ 27873 32738 32516 ... 180288 266560 240278] Train: [254640 199341 142105 ... 12670 101875 198010] Test: [271978 26865 233939 ... 192333 53941 79180] Train: [128590 147863 195521 ... 207013 191362 282945] Test: [207007 253186 5845 ... 250124 12579 208115] Train: [273617 233366 205416 ... 66225 263933 83687] Test: [174185 95912 279859 ... 275455 57898 201168]
# Distinct labels and their counts in each partition of the split.
train_unique_label, train_label_count = np.unique(original_y_train, return_counts=True)
test_unique_label, test_label_count = np.unique(original_y_test, return_counts=True)

# Counts expressed as a fraction of the partition size.
train_label_rate = train_label_count / len(original_ytrain)
test_label_rate = test_label_count / len(original_ytest)

# Training partition: absolute counts first, then percentages per label.
train_counts_line = "\nTrain Set Label {}:{}, Label {}:{}"
print(train_counts_line.format(
    train_unique_label[0], train_label_count[0],
    train_unique_label[1], train_label_count[1],
))
print("\nTrain Set Label Distribution:")
for position, label in enumerate(train_unique_label):
    print('Label {} Percentage:{:.2f}%'.format(label, 100 * train_label_rate[position]))

# Test partition: same report.
test_counts_line = "\nTest Set Label {}:{}个, Label {}:{}个"
print(test_counts_line.format(
    test_unique_label[0], test_label_count[0],
    test_unique_label[1], test_label_count[1],
))
print("\nTest Set Label Distribution:")
for position, label in enumerate(test_unique_label):
    print('Label {} Percentage:{:.2f}%'.format(label, 100 * test_label_rate[position]))
Train Set Label 0:199020, Label 1:344 Train Set Label Distribution: Label 0 Percentage:99.83% Label 1 Percentage:0.17% Test Set Label 0:85295个, Label 1:148个 Test Set Label Distribution: Label 0 Percentage:99.83% Label 1 Percentage:0.17%
# scikit-learn needs to be 1.2.2 (https://github.com/scikit-learn-contrib/imbalanced-learn/issues/995)
# pip install imbalanced-learn
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek
# Three resampling strategies for balancing the training set, each seeded so
# the resampled sets are reproducible.
ros = RandomOverSampler(random_state = 0)
sos = SMOTE(random_state = 0)
kos = SMOTETomek(random_state = 0)
# fit_resample is the supported entry point: fit_sample was deprecated and
# later removed from imbalanced-learn, where calling it raises AttributeError.
# Only the training partition is resampled; the test partition stays untouched.
X_ros, y_ros = ros.fit_resample(original_X_train, original_y_train)
X_sos, y_sos = sos.fit_resample(original_X_train, original_y_train)
X_kos, y_kos = kos.fit_resample(original_X_train, original_y_train)
# Size of each resampled training set.
print('ros:{}, sos:{}, kos:{}'.format(len(y_ros), len(y_sos), len(y_kos)))
ros:398040, sos:398040, kos:398040
# Labels are 0/1, so summing a label array counts its fraud samples.
# Report that count for the original training set and for each resampled set.
print('Training set fraud sample, original training set:{} \nTraining set after RandomOverSampler:{}, Training set after SMOTE:{}, Training set after SMOTETomek:{}\
'.format(original_y_train.sum(), y_ros.sum(), y_sos.sum(), y_kos.sum()))
# Fraud share before resampling versus after random over-sampling.
print('Fraud percentage in the original training set {:.2f}%, After oversampling fraud sample percentage increases to:{:.0f}%'.format(100*original_y_train.sum()/len(original_y_train), 100*y_ros.sum()/len(y_ros)))
Training set fraud sample, original training set:344 Training set after RandomOverSampler:199020, Training set after SMOTE:199020, Training set after SMOTETomek:199020 Fraud percentage in the original training set 0.17%, After oversampling fraud sample percentage increases to:50%
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_curve, auc, recall_score, classification_report
lr = LogisticRegression(max_iter=500, C=100)
# Alternative left disabled: a 10-fold GridSearchCV (scoring='f1', n_jobs=-1)
# over C in [0.01, 0.1, 1, 5, 10, 100]; the fixed C=100 above is used instead.

# The original training set followed by its three resampled variants.
# Named `training_sets` rather than `data` so the loaded DataFrame is not
# overwritten and the earlier cells stay re-runnable.
training_sets = [
    (original_X_train, original_y_train),
    (X_ros, y_ros),
    (X_sos, y_sos),
    (X_kos, y_kos),
]
for features, labels in training_sets:
    # Flatten column-vector labels to the 1-D shape the estimator expects.
    lr.fit(features, np.ravel(labels))
    # Hard 0/1 predictions feed recall and precision.
    predict_test = lr.predict(original_X_test)
    # ROC AUC must be computed from a continuous score (here the predicted
    # fraud probability); thresholded labels collapse the ROC curve to a
    # single operating point and misreport the AUC.
    score_test = lr.predict_proba(original_X_test)[:, 1]
    print('AUC:{} Recall:{} Precision:{}'.format(
        metrics.roc_auc_score(original_y_test, score_test),
        metrics.recall_score(original_y_test, predict_test),
        metrics.precision_score(original_y_test, predict_test)
    ))
AUC:0.8310341850144887 Recall:0.6621621621621622 Precision:0.9245283018867925 AUC:0.9499121094833036 Recall:0.9256756756756757 Precision:0.058497011101622545 AUC:0.9493317706592225 Recall:0.9256756756756757 Precision:0.05612453912331012 AUC:0.9493317706592225 Recall:0.9256756756756757 Precision:0.05612453912331012
# Cost-sensitive variant: fraud (label 1) carries a class weight of 5 against
# 1 for normal transactions, trained on the original, non-resampled data.
lr2 = LogisticRegression(max_iter = 500, C=100, class_weight={0:1,1:5})
# Alternative left disabled: a 10-fold GridSearchCV (scoring='f1') over
# C in [0.01, 0.1, 1, 5, 10, 100] and class_weight in
# [{0:1,1:3}, {0:1,1:5}, {0:1,1:10}, {0:1,1:15}].
# Flatten column-vector labels to the 1-D shape the estimator expects.
lr2.fit(original_X_train, np.ravel(original_y_train))
# Hard 0/1 predictions feed recall and precision.
predict2 = lr2.predict(original_X_test)
# ROC AUC must be computed from a continuous score (here the predicted fraud
# probability); thresholded labels collapse the ROC curve to a single
# operating point and misreport the AUC.
score2 = lr2.predict_proba(original_X_test)[:, 1]
print('AUC:{:.3f} Recall:{:.3f} Precision:{:.3f}'.format(
    metrics.roc_auc_score(original_y_test, score2),
    metrics.recall_score(original_y_test, predict2),
    metrics.precision_score(original_y_test, predict2)
))
AUC:0.912 Recall:0.824 Precision:0.819
import itertools
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues, normalize=False):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    The function only draws; creating the figure and calling ``plt.show()``
    is left to the caller.

    Args:
        cm: 2-D array holding the confusion matrix. Rows are labelled
            'True label' and columns 'Predicted label'.
        classes: Tick labels, one per class, used on both axes.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.
        normalize: When True, each row is divided by its sum so that the
            cells show fractions of the true class, printed with two
            decimals. Rows that sum to zero stay at zero. Defaults to False,
            which plots the values of ``cm`` as given.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Skip the division for empty rows so no NaN or inf reaches the plot.
        cm = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text to stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # plt.tight_layout() is intentionally left disabled, as in the original.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Confusion matrix of the class-weighted model on the data it was trained on.
y_train_pre = lr2.predict(original_X_train)
cnf_matrix_train = confusion_matrix(original_y_train, y_train_pre)

train_samples_line = "Training set label samples:\nLabel {}:{}, Label {}:{}个"
print(train_samples_line.format(
    train_unique_label[0], train_label_count[0],
    train_unique_label[1], train_label_count[1],
))

# Recall = correctly flagged frauds / all actual frauds (second matrix row).
train_true_pos = cnf_matrix_train[1, 1]
train_false_neg = cnf_matrix_train[1, 0]
print("Training set Recall: {:.2f}%".format(100 * train_true_pos / (train_false_neg + train_true_pos)))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix_train, classes=class_names, title='Confusion matrix')
plt.show()
Training set label samples: Label 0:199020, Label 1:344个 Training set Recall: 78.78%
# Confusion matrix of the class-weighted model on the held-out test set.
y_pre = lr2.predict(original_X_test)
cnf_matrix_test = confusion_matrix(original_y_test, y_pre)

test_samples_line = "Test set label samples:\nLabel {}:{}, Label {}:{}"
print(test_samples_line.format(
    test_unique_label[0], test_label_count[0],
    test_unique_label[1], test_label_count[1],
))

# Recall = correctly flagged frauds / all actual frauds (second matrix row).
test_true_pos = cnf_matrix_test[1, 1]
test_false_neg = cnf_matrix_test[1, 0]
print("Test set Recall: {:.2f}%".format(100 * test_true_pos / (test_false_neg + test_true_pos)))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix_test, classes=class_names, title='Confusion matrix')
plt.show()
Test set label samples: Label 0:85295, Label 1:148 Test set Recall: 82.43%
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
# Small search grid: two split criteria, depths 2-3, minimum leaf sizes 5-6.
tree_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [2, 3],
    "min_samples_leaf": [5, 6],
}
# Pick the combination with the best cross-validated F1 score.
dt_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=tree_params,
    scoring='f1',
)
# ravel() flattens the column-vector labels to 1-D before fitting.
dt_cv.fit(original_X_train, original_y_train.ravel())
GridSearchCV(estimator=DecisionTreeClassifier(), param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 3], 'min_samples_leaf': [5, 6]}, scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(estimator=DecisionTreeClassifier(), param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 3], 'min_samples_leaf': [5, 6]}, scoring='f1')
DecisionTreeClassifier()
DecisionTreeClassifier()
# Show the parameter combination selected by the grid search.
print('Best parameters:',dt_cv.best_params_)
Best parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 5}
# Hard 0/1 predictions of the tuned tree feed recall and precision.
predict3 = dt_cv.predict(original_X_test)
# ROC AUC must be computed from a continuous score (here the predicted fraud
# probability); thresholded labels collapse the ROC curve to a single
# operating point and misreport the AUC.
score3 = dt_cv.predict_proba(original_X_test)[:, 1]
print('AUC:{:.3f} Recall:{:.3f} Precision:{:.3f}'.format(
    metrics.roc_auc_score(original_y_test, score3),
    metrics.recall_score(original_y_test, predict3),
    metrics.precision_score(original_y_test, predict3)
))
AUC:0.909 Recall:0.818 Precision:0.818
# Standalone tree for visualisation, built from the parameters the grid search
# selected instead of hand-copied constants, so it cannot drift from the
# search result when the notebook is re-run.
dt_clf = DecisionTreeClassifier(**dt_cv.best_params_)
# ravel() flattens the column-vector labels to 1-D before fitting.
dt_clf.fit(original_X_train, original_y_train.ravel())
DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
import pydotplus
from IPython.display import display, Image
# Export the fitted tree as Graphviz DOT text; out_file=None returns it as a string.
dot_data = export_graphviz(
    dt_clf,
    out_file=None,
    feature_names=X.columns,
    class_names=['normal', 'fraud'],
    filled=True,
    rounded=True,
)
# Render the DOT text to PNG bytes and show the image inline.
graph = pydotplus.graph_from_dot_data(dot_data)
tree_png = graph.create_png()
display(Image(tree_png))
# ROC curves are traced from continuous scores (predicted fraud probability).
# Thresholded 0/1 predictions would give a single operating point per model
# instead of a curve, and a misleading AUC.
lr_scores = lr2.predict_proba(original_X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(original_y_test, lr_scores)
roc_auc = auc(fpr, tpr)
print('Logistic Regression AUC:{:.2f}%'.format(100*roc_auc))

dt_scores = dt_cv.predict_proba(original_X_test)[:, 1]
dt_fpr, dt_tpr, dt_thresholds = roc_curve(original_y_test, dt_scores)
dt_roc_auc = auc(dt_fpr, dt_tpr)
print('Decision Tree AUC:{:.2f}%'.format(100*dt_roc_auc))

# Plot ROC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',label='Logistic Regression AUC = %0.3f'% roc_auc)
plt.plot(dt_fpr, dt_tpr, 'y', label='Decision Tree AUC = %0.3f'% dt_roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Logistic Regression AUC:91.20% Decision Tree AUC:90.86%