import numpy as np
import pandas as pd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


# Load the transactions CSV into a DataFrame and report its (rows, columns) shape.
data = pd.read_csv('./creditcardfraud.csv')
data.shape

(284807, 31)


# Preview the first rows of the dataset.
data.head()


# Total number of missing cells across the whole DataFrame.
data.isnull().values.sum()

0


# Count how many transactions carry each Class label.
transaction_class = data.Class.value_counts().reset_index()
transaction_class


# Name the two columns by position rather than renaming 'index'/'Class'.
# pandas < 2.0 labels them ('index', 'Class') while pandas >= 2.0 labels them
# ('Class', 'count'); a name-based rename therefore mislabels the label column
# on newer versions and the map() below fails with a KeyError. In both
# versions the first column holds the class label and the second its count.
transaction_class.columns = ['Transaction Type', 'Frequency']
transaction_class['Transaction Type'] = transaction_class['Transaction Type'].map({0:'Normal',1:'Fraud'})
# 'Percentage' is stored as a fraction of all transactions, not scaled by 100.
transaction_class['Percentage'] = transaction_class['Frequency']/transaction_class['Frequency'].sum()
transaction_class


# Horizontal bar chart of the per-class transaction counts. Every bar is
# annotated with its raw frequency and its share of all transactions.
transaction_class_fig = px.bar(
    transaction_class,
    x='Frequency',
    y='Transaction Type',
    orientation='h',
    width=800,
    height=300,
    text=[
        'Frequency:{}<br>Percentage:{:.2f}%'.format(count, 100*share)
        for count, share in zip(transaction_class['Frequency'], transaction_class['Percentage'])
    ],
)

# Centre the title along the top edge of the figure.
transaction_class_fig.update_layout(
    title=dict(text='Transaction Types', y=0.97, x=0.5, xanchor='center', yanchor='top')
)


def draw_hist(dataframe, x_name, x_rename, title, color, nbins_count = 100, height = 400, width = 900, log_y=False):
    """Build a single-colour Plotly histogram of one column and show it.

    Args:
        dataframe: Data passed to ``px.histogram``.
        x_name: Name of the column to bin.
        x_rename: Axis label displayed in place of ``x_name``.
        title: Figure title, centred at the top of the figure.
        color: Colour applied to the histogram bars.
        nbins_count: Value forwarded as ``nbins``.
        height: Figure height.
        width: Figure width.
        log_y: Forwarded as ``log_y`` to ``px.histogram``.

    Returns:
        None. The figure is displayed through ``fig.show()``.
    """
    title_layout = dict(text=title, y=0.97, x=0.5, xanchor='center', yanchor='top')
    histogram_options = dict(
        x=x_name,
        nbins=nbins_count,
        labels={x_name: x_rename},
        opacity=0.9,
        color_discrete_sequence=[color],
        height=height,
        width=width,
        log_y=log_y,
    )

    fig = px.histogram(dataframe, **histogram_options)
    fig.update_layout(title=title_layout)
    fig.show()


# Separate the transactions by class label so each group is plotted on its own.
normal_transaction = data.loc[data['Class'] == 0]
fraud_transaction = data.loc[data['Class'] == 1]

# Transaction-time histograms on a linear count axis.
draw_hist(
    normal_transaction,
    x_name='Time',
    x_rename='Transaction Time',
    title='Normal Transaction Time Distribution',
    color='rgb(156,219,165)',
)
draw_hist(
    fraud_transaction,
    x_name='Time',
    x_rename='Transaction Time',
    title='Fraud Transaction Time Distribution',
    color='rgb(214,96,77)',
)


# The same two histograms again, this time with a logarithmic count axis.
draw_hist(
    normal_transaction,
    x_name='Time',
    x_rename='Transaction Time',
    title='Normal Transaction Time Distribution',
    color='rgb(156,219,165)',
    log_y=True,
)
draw_hist(
    fraud_transaction,
    x_name='Time',
    x_rename='Transaction Time',
    title='Fraud Transaction Time Distribution',
    color='rgb(214,96,77)',
    log_y=True,
)


# Box plot of the transaction amount for normal transactions.
normal_amount_box = px.box(normal_transaction, x='Amount', height=200, width=700)
normal_amount_box.update_layout(
    title=dict(text='Normal Transaction Amount Distribution', y=0.97, x=0.5, xanchor='center', yanchor='top')
)
normal_amount_box.show()


# Box plot of the transaction amount for fraud transactions.
fraud_amount_box = px.box(fraud_transaction, x='Amount', height=200, width=700)
fraud_amount_box.update_layout(
    title=dict(text='Fraud Transaction Amount Distribution', y=0.97, x=0.5, xanchor='center', yanchor='top')
)
fraud_amount_box.show()


# Amount histograms: 30 bins on a 700x300 canvas instead of draw_hist's defaults.
draw_hist(
    normal_transaction,
    x_name='Amount',
    x_rename='Transaction Amount',
    title='Normal Transaction Amount Distribution',
    color='rgb(156,219,165)',
    nbins_count=30,
    height=300,
    width=700,
)
draw_hist(
    fraud_transaction,
    x_name='Amount',
    x_rename='Transaction Amount',
    title='Fraud Transaction Amount Distribution',
    color='rgb(214,96,77)',
    nbins_count=30,
    height=300,
    width=700,
)


# Scatter of amount against time for normal transactions.
normal_time_amount_fig = px.scatter(normal_transaction, x='Time', y='Amount', width=700, height=380)
normal_time_amount_fig.update_layout(
    title=dict(text='Normal Transaction Time vs Amount', y=0.97, x=0.5, xanchor='center', yanchor='top')
)
normal_time_amount_fig.show()


# Scatter of amount against time for fraud transactions.
fraud_time_amount_fig = px.scatter(fraud_transaction, x='Time', y='Amount', width=700, height=380)
fraud_time_amount_fig.update_layout(
    title=dict(text='Fraud Transaction Time vs Amount', y=0.97, x=0.5, xanchor='center', yanchor='top')
)
fraud_time_amount_fig.show()


from sklearn.preprocessing import StandardScaler, RobustScaler

std_scaler = StandardScaler()

# Scale each column once and reuse the result for both the new DataFrame
# column and the printout. Previously the scaler was refit a second time per
# column only to print the same values. Amount is fit first and Time last, so
# std_scaler ends up fitted on Time exactly as before.
# NOTE(review): the scaler is fit on the full dataset, ahead of the train/test
# split performed later in this file, so test rows influence the scaling.
std_scaled_amount = std_scaler.fit_transform(data['Amount'].values.reshape(-1,1))
std_scaled_time = std_scaler.fit_transform(data['Time'].values.reshape(-1,1))

data['stdScaledAmount'] = std_scaled_amount
data['stdScaledTime'] = std_scaled_time

print('Original Amount：',data['Amount'].values.reshape(-1,1))
print('Std Scaled Amount：',std_scaled_amount)
print('Original Time：',data['Time'].values.reshape(-1,1))
print('Std Scaled Time：',std_scaled_time)

Original Amount： [[149.62]
 [  2.69]
 [378.66]
 ...
 [ 67.88]
 [ 10.  ]
 [217.  ]]
Std Scaled Amount： [[ 0.24496426]
 [-0.34247454]
 [ 1.16068593]
 ...
 [-0.0818393 ]
 [-0.31324853]
 [ 0.51435531]]
Original Time： [[0.00000e+00]
 [0.00000e+00]
 [1.00000e+00]
 ...
 [1.72788e+05]
 [1.72788e+05]
 [1.72792e+05]]
Std Scaled Time： [[-1.99658302]
 [-1.99658302]
 [-1.99656197]
 ...
 [ 1.6419735 ]
 [ 1.6419735 ]
 [ 1.64205773]]


rub_scaler = RobustScaler()

# Scale each column once and reuse the result for both the new DataFrame
# column and the printout. Previously the scaler was refit a second time per
# column only to print the same values. Amount is fit first and Time last, so
# rub_scaler ends up fitted on Time exactly as before.
# NOTE(review): the scaler is fit on the full dataset, ahead of the train/test
# split performed later in this file, so test rows influence the scaling.
rub_scaled_amount = rub_scaler.fit_transform(data['Amount'].values.reshape(-1,1))
rub_scaled_time = rub_scaler.fit_transform(data['Time'].values.reshape(-1,1))

data['rubScaledAmount'] = rub_scaled_amount
data['rubScaledTime'] = rub_scaled_time

print('Original Amount：',data['Amount'].values.reshape(-1,1))
print('Rub Scaled Amount：',rub_scaled_amount)
print('Original Time：',data['Time'].values.reshape(-1,1))
print('Rub Scaled Time：',rub_scaled_time)

Original Amount： [[149.62]
 [  2.69]
 [378.66]
 ...
 [ 67.88]
 [ 10.  ]
 [217.  ]]
Rub Scaled Amount： [[ 1.78327395]
 [-0.26982463]
 [ 4.98372109]
 ...
 [ 0.64109551]
 [-0.16767973]
 [ 2.72479564]]
Original Time： [[0.00000e+00]
 [0.00000e+00]
 [1.00000e+00]
 ...
 [1.72788e+05]
 [1.72788e+05]
 [1.72792e+05]]
Rub Scaled Time： [[-0.99498349]
 [-0.99498349]
 [-0.99497175]
 ...
 [ 1.03497457]
 [ 1.03497457]
 [ 1.03502156]]


# Remove the unscaled Time and Amount columns; every other column is kept.
new_data = data.drop(columns=['Time', 'Amount'])
new_data.head()


from sklearn.model_selection import train_test_split       
from sklearn.model_selection import StratifiedShuffleSplit 

# Features are every column except 'Class'; the target is kept as a
# one-column DataFrame holding 'Class'.
X = new_data.drop(columns='Class')
y = new_data[['Class']]
print('X shape:',X.shape, '\ny shape:',y.shape)

X shape: (284807, 32) 
y shape: (284807, 1)


# Splitter configured for 10 shuffled splits, each holding out 30% of the rows
# as the test portion, with a fixed random_state.
sss = StratifiedShuffleSplit(n_splits=10,test_size=0.3,train_size=None, random_state=80)

# Alternative: Random sampling
# X = np.array(new_data.iloc[:, new_data.columns != 'Class']) 
# y = np.array(new_data.iloc[:, new_data.columns == 'Class']) 
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=80)

# Every iteration prints its indices and rebinds the same four names, so only
# the split from the final iteration is still bound once the loop ends.
for train_index, test_index in sss.split(X,y):
    print('Train:', train_index, 'Test:', test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# NumPy copies of that final split; y is a one-column DataFrame, so the label
# arrays come out as columns rather than flat vectors.
original_X_train = np.array(original_Xtrain)
original_X_test = np.array(original_Xtest)
original_y_train = np.array(original_ytrain)
original_y_test = np.array(original_ytest)

Train: [277279 271167 243548 ... 230480 200350 106351] Test: [234266 275718 133564 ... 244125  28889 214407]
Train: [106342  32307 134295 ...  16289  55959 113257] Test: [145064 105889 141570 ... 113839  57961  74335]
Train: [ 62862  48218 280526 ...  73828   8090 147350] Test: [142773 273368  15299 ... 246861 187187 168339]
Train: [ 89779 111045  68471 ... 119218 186855 113718] Test: [138949 230823  37236 ...  27393  14353 240755]
Train: [250963  29495  63061 ...  12292 250661 270470] Test: [176901  81122  97173 ... 171813  50060 251697]
Train: [184435 266521  53453 ... 274760 107306   3101] Test: [165220 227906   6723 ... 267331  14144   5792]
Train: [ 17871  69032 190049 ...  68457  86789 137289] Test: [ 27873  32738  32516 ... 180288 266560 240278]
Train: [254640 199341 142105 ...  12670 101875 198010] Test: [271978  26865 233939 ... 192333  53941  79180]
Train: [128590 147863 195521 ... 207013 191362 282945] Test: [207007 253186   5845 ... 250124  12579 208115]
Train: [273617 233366 205416 ...  66225 263933  83687] Test: [174185  95912 279859 ... 275455  57898 201168]


# Distinct labels and their occurrence counts in each split.
train_unique_label, train_label_count = np.unique(original_y_train, return_counts=True)
test_unique_label, test_label_count = np.unique(original_y_test, return_counts=True)

# Share of each label within its split (fractions, scaled by 100 when printed).
train_label_rate = train_label_count / len(original_ytrain)
test_label_rate = test_label_count / len(original_ytest)


# Report counts and percentages for the training split.
print("\nTrain Set Label {}:{}, Label {}:{}".format(train_unique_label[0],train_label_count[0], train_unique_label[1],train_label_count[1] ))
print("\nTrain Set Label Distribution:")
for label,rate in zip(train_unique_label, train_label_rate):
    print('Label {} Percentage:{:.2f}%'.format(label,100*rate))

# Report counts and percentages for the test split.
print("\nTest Set Label {}:{}个, Label {}:{}个".format(test_unique_label[0],test_label_count[0], test_unique_label[1],test_label_count[1] ))
print("\nTest Set Label Distribution：")
for label,rate in zip(test_unique_label,test_label_rate):
    print('Label {} Percentage:{:.2f}%'.format(label,100*rate))

Train Set Label 0:199020, Label 1:344

Train Set Label Distribution:
Label 0 Percentage:99.83%
Label 1 Percentage:0.17%

Test Set Label 0:85295个, Label 1:148个

Test Set Label Distribution：
Label 0 Percentage:99.83%
Label 1 Percentage:0.17%


# scikit-learn needs to be 1.2.2 (https://github.com/scikit-learn-contrib/imbalanced-learn/issues/995)
# pip install imbalanced-learn

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek


# Three resampling strategies, each seeded for reproducibility.
ros = RandomOverSampler(random_state = 0)
sos = SMOTE(random_state = 0)
kos = SMOTETomek(random_state = 0)

# Resample the training split only; the test split is left untouched.
# fit_resample is the supported sampler entry point: the older fit_sample
# alias is absent from current imbalanced-learn releases and raises
# AttributeError there.
# ravel(): the training labels are stored as an (n, 1) column, whereas the
# samplers work on a flat label vector.
X_ros, y_ros = ros.fit_resample(original_X_train, original_y_train.ravel())
X_sos, y_sos = sos.fit_resample(original_X_train, original_y_train.ravel())
X_kos, y_kos = kos.fit_resample(original_X_train, original_y_train.ravel())

print('ros:{}, sos:{}, kos:{}'.format(len(y_ros), len(y_sos), len(y_kos)))

ros:398040, sos:398040, kos:398040


# Labels are 0/1, so summing a label array counts its fraud samples. Report the
# fraud count before and after each resampler, then the fraud share before
# resampling and after RandomOverSampler.
print('Training set fraud sample, original training set:{} \nTraining set after RandomOverSampler:{}, Training set after SMOTE:{}, Training set after SMOTETomek:{}\
      '.format(original_y_train.sum(), y_ros.sum(), y_sos.sum(), y_kos.sum()))
print('Fraud percentage in the original training set {:.2f}%, After oversampling fraud sample percentage increases to：{:.0f}%'.format(100*original_y_train.sum()/len(original_y_train), 100*y_ros.sum()/len(y_ros)))

Training set fraud sample, original training set:344 
Training set after RandomOverSampler:199020, Training set after SMOTE:199020, Training set after SMOTETomek:199020      
Fraud percentage in the original training set 0.17%, After oversampling fraud sample percentage increases to：50%


from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_curve, auc, recall_score, classification_report


lr = LogisticRegression(max_iter=500, C=100)

# Disabled alternative: grid-search C with 10-fold CV scored by F1.
#paramaters = {'C':[0.01,0.1,1,5,10,100]}
#lr_cv1 = GridSearchCV(lr, param_grid = paramaters, cv=10, n_jobs=-1, verbose=5, scoring='f1')  # n_jobs=-1


# Candidate training sets: the original split followed by the three resampled
# variants. They get their own name because binding this list to `data`
# overwrote the transactions DataFrame loaded at the top of the file, which
# broke every earlier cell on re-run.
training_sets = [[original_X_train, original_y_train],
                 [X_ros, y_ros],
                 [X_sos, y_sos],
                 [X_kos, y_kos]]


# Refit the same estimator on each training set and score it on the untouched
# test split.
for features, labels in training_sets:
    # np.ravel flattens an (n, 1) label column into the flat vector that
    # LogisticRegression.fit expects; labels that are already flat pass through.
    lr.fit(features, np.ravel(labels))

    #lr_cv1.fit(features, labels)
    predict_test = lr.predict(original_X_test)
    # NOTE(review): roc_auc_score receives hard 0/1 predictions here, not
    # probabilities or decision scores, so the figure reflects one operating
    # point. Left as is so the recorded results stay reproducible.
    print('AUC:{} Recall:{} Precision:{}'.format(
        metrics.roc_auc_score(original_y_test, predict_test),
        metrics.recall_score(original_y_test, predict_test),
        metrics.precision_score(original_y_test, predict_test)
    ))

AUC:0.8310341850144887 Recall:0.6621621621621622 Precision:0.9245283018867925
AUC:0.9499121094833036 Recall:0.9256756756756757 Precision:0.058497011101622545
AUC:0.9493317706592225 Recall:0.9256756756756757 Precision:0.05612453912331012
AUC:0.9493317706592225 Recall:0.9256756756756757 Precision:0.05612453912331012


# Logistic regression on the original (non-resampled) training split, with
# class 1 weighted five times as heavily as class 0.
lr2 = LogisticRegression(max_iter = 500, C=100, class_weight={0:1,1:5})
# Disabled alternative: grid-search C together with the class weights.
#param_grid= {'C':[0.01,0.1,1,5,10,100], 
#            'class_weight':[{0:1,1:3}, {0:1,1:5},{0:1,1:10}, {0:1,1:15}]
#            } 

#lr_cv2 = GridSearchCV(lr, param_grid = param_grid, cv=10, n_jobs=-1, verbose=5, scoring='f1')  # n_jobs=-1

# Disabled: fit and predict through the grid search instead of lr2.
#lr_cv2.fit(original_X_train, original_y_train)
#predict2 = lr_cv2.predict(original_X_test)
lr2.fit(original_X_train, original_y_train)
predict2 = lr2.predict(original_X_test)

# NOTE(review): predict2 holds hard class predictions, so the AUC printed here
# is computed from 0/1 labels rather than from scores.
print('AUC:{:.3f} Recall:{:.3f} Precision:{:.3f}'.format(
        metrics.roc_auc_score(original_y_test, predict2),
        metrics.recall_score(original_y_test, predict2),
        metrics.precision_score(original_y_test, predict2)
    ))

AUC:0.912 Recall:0.824 Precision:0.819


import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues, normalize=False):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Rows are labelled as true classes and columns as predicted classes. Each
    cell is coloured by its value and carries that value as text. The function
    draws with pyplot only; creating and showing the figure is left to the
    caller.

    Args:
        cm: 2-D array of counts, indexed as ``cm[true, predicted]``.
        classes: Tick labels, one per class, used on both axes.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.
        normalize: When True, every row is divided by its total so the cells
            show per-true-class fractions (formatted to two decimals) instead
            of raw counts. Defaults to False, which plots ``cm`` unchanged.

    Returns:
        None.
    """
    if normalize:
        cm = np.asarray(cm, dtype=float)
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has an all-zero row; leave it at zero
        # rather than dividing by zero and filling the row with NaN.
        cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than the midpoint get white text so the label stays legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Predict on the training split itself with the class-weighted model.
y_train_pre = lr2.predict(original_X_train)

cnf_matrix_train = confusion_matrix(original_y_train, y_train_pre)

print("Training set label samples:\nLabel {}:{}, Label {}:{}个".format(train_unique_label[0],train_label_count[0], train_unique_label[1],train_label_count[1] ))
# Recall is taken from the row of true label-1 samples: the correctly
# predicted count divided by the row total.
print("Training set Recall: {:.2f}%".format(100*cnf_matrix_train[1,1]/(cnf_matrix_train[1,0]+cnf_matrix_train[1,1])))

# Plot the matrix on a fresh figure.
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_train , classes=class_names, title='Confusion matrix')
plt.show()

Training set label samples:
Label 0:199020, Label 1:344个
Training set Recall: 78.78%


# Predict on the held-out test split with the class-weighted model.
y_pre = lr2.predict(original_X_test)

cnf_matrix_test = confusion_matrix(original_y_test, y_pre)

print("Test set label samples：\nLabel {}:{}, Label {}:{}".format(test_unique_label[0],test_label_count[0], test_unique_label[1],test_label_count[1] ))
# Recall is taken from the row of true label-1 samples: the correctly
# predicted count divided by the row total.
print("Test set Recall: {:.2f}%".format(100*cnf_matrix_test[1,1]/(cnf_matrix_test[1,0]+cnf_matrix_test[1,1])))

# Plot the matrix on a fresh figure.
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_test , classes=class_names, title='Confusion matrix')
plt.show()

Test set label samples：
Label 0:85295, Label 1:148
Test set Recall: 82.43%


from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz


# Hyper-parameter grid for the decision tree: two split criteria, depths 2-3
# and minimum leaf sizes 5-6. Candidates are scored by F1.
tree_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [2, 3],
    "min_samples_leaf": [5, 6],
}
dt_cv = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=tree_params, scoring='f1')
# ravel() flattens the (n, 1) label column into a flat label vector.
dt_cv.fit(original_X_train, original_y_train.ravel())

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 3],
                         'min_samples_leaf': [5, 6]},
             scoring='f1')

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 3],
                         'min_samples_leaf': [5, 6]},
             scoring='f1')

DecisionTreeClassifier()

DecisionTreeClassifier()


# Show the parameter combination selected by the grid search.
print('Best parameters：',dt_cv.best_params_)

Best parameters： {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 5}


# Score the grid-searched decision tree on the held-out test split.
# NOTE(review): predict3 holds hard class predictions, so the AUC printed here
# is computed from 0/1 labels rather than from scores.
predict3 = dt_cv.predict(original_X_test)
print('AUC:{:.3f} Recall:{:.3f} Precision:{:.3f}'.format(
        metrics.roc_auc_score(original_y_test, predict3),
        metrics.recall_score(original_y_test, predict3),
        metrics.precision_score(original_y_test, predict3)
    ))

AUC:0.909 Recall:0.818 Precision:0.818


# Standalone tree for visualisation, fitted on the original training split.
# NOTE(review): the parameters are hard-coded and appear to be copied from the
# grid-search result; they will not follow dt_cv.best_params_ if it changes.
dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5) 
dt_clf.fit(original_X_train, original_y_train.ravel())

DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)

DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)


import pydotplus
from IPython.display import display, Image

# Export the fitted tree as DOT source (out_file=None returns it as a string),
# labelling nodes with the feature column names and the two class names.
dot_data = export_graphviz(dt_clf, 
                            out_file=None, 
                            feature_names=X.columns, 
                            class_names = ['normal', 'fraud'],
                            filled = True, 
                            rounded =True  
                        )
                               
# Render the DOT source to a PNG and display it inline.
graph = pydotplus.graph_from_dot_data(dot_data)
display(Image(graph.create_png()))


# NOTE(review): predict2 and predict3 are hard 0/1 class predictions, so each
# ROC curve below is built from labels rather than from probabilities or
# decision scores. Left unchanged so the recorded AUC values stay reproducible.
fpr, tpr, thresholds = roc_curve(original_y_test, predict2)
roc_auc = auc(fpr, tpr)
print('Logistic Regression AUC：{:.2f}%'.format(100*roc_auc))


dt_fpr, dt_tpr, dt_thresholds = roc_curve(original_y_test, predict3)
dt_roc_auc = auc(dt_fpr, dt_tpr)
print('Decision Tree AUC：{:.2f}%'.format(100*dt_roc_auc))

# Plot both ROC curves against the chance diagonal.

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',label='Logistic Regression AUC = %0.3f'% roc_auc)
# Legend label corrected from the misspelled 'Decision Tress'.
plt.plot(dt_fpr, dt_tpr, 'y', label='Decision Tree AUC = %0.3f'% dt_roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

Logistic Regression AUC：91.20%
Decision Tree AUC：90.86%

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	...	V24	V25	V26	V27	V28	stdScaledAmount	stdScaledTime	rubScaledAmount	rubScaledTime
0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	...	0.066928	0.128539	-0.189115	0.133558	-0.021053	0.244964	-1.996583	1.783274	-0.994983
1	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	...	-0.339846	0.167170	0.125895	-0.008983	0.014724	-0.342475	-1.996583	-0.269825	-0.994983
2	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	0.207643	...	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	1.160686	-1.996562	4.983721	-0.994972
3	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	-0.054952	...	-1.175575	0.647376	-0.221929	0.062723	0.061458	0.140534	-1.996562	1.418291	-0.994972
4	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	0.753074	...	0.141267	-0.206010	0.502292	0.219422	0.215153	-0.073403	-1.996541	0.670579	-0.994960

Background¶

Exploratory data analysis¶

Transaction types¶

Transaction time¶

Transaction amount¶

Transaction time vs amount¶

Data preprocessing¶

Data standardization¶

Sample balance¶

Train and test set split¶

Balance samples on the training set¶

Prediction models¶

Logistic regression¶

Confusion matrix¶

Decision tree¶

Visualize decision tree¶

Model performance comparison¶

	index	Class
0	0	284315
1	1	492

	Transaction Type	Frequency	Percentage
0	Normal	284315	0.998273
1	Fraud	492	0.001727