当前位置：网站首页>Fraud detection cases and Titanic rescued cases

Fraud detection cases and Titanic rescued cases

2022-07-24 14:20:00 【Strong fight】

Fraud detection case (imbalanced samples, standardization, cross-validation, model evaluation)

# Bar chart of how many rows fall into each value of the 'Class' column.
class_frequencies = data['Class'].value_counts(sort=True)
count_classes = class_frequencies.sort_index()
count_classes.plot.bar()
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

# Standardize the 'Amount' column (zero mean, unit variance) into 'normAmount'.
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D array. A pandas Series has no reshape(), so go
# through .values; reshape(-1, 1) builds a single column and lets NumPy infer
# the row count. ravel() flattens the result back to 1-D for the assignment.
data['normAmount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1)
).ravel()
# drop() takes all labels as ONE list; passing two positional strings makes
# the second one collide with the axis argument.
data = data.drop(["Time", "Amount"], axis=1)

# Undersampling strategy: keep every row with Class == 1 and an equally sized
# random sample of rows with Class == 0, so both classes have the same count.
# (.ix has been removed from pandas; label-based selection uses .loc.)
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Number of Class == 1 rows and their index labels.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index

# Randomly pick, without replacement, as many Class == 0 index labels as there
# are Class == 1 rows.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Merge both sets of index labels and select the rows. These are labels, not
# positions, so .loc is the correct accessor.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]

# The boolean column mask must be built from .columns, not from the frame.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Split the data into training and test sets (30% held out).
# sklearn.cross_validation was removed from scikit-learn; the same helpers
# live in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# The balanced (undersampled) subset gets its own split; the *_undersample
# names are the ones the logistic-regression snippet further down trains on.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0
)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report

Confusion matrix

# The decision threshold can be chosen freely: the higher it is, the more
# confident the model must be before a sample is assigned to class 1.
# An L1 penalty needs a solver that supports it; liblinear does, whereas the
# current scikit-learn default solver rejects penalty='l1'.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Per-class probabilities from the model; column 1 is used below as the
# probability of class 1.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]
np.set_printoptions(precision=2)

plt.figure(figsize=(10, 10))
# One subplot per threshold, laid out on a 3x3 grid (subplot indices start at 1).
for j, i in enumerate(thresholds, start=1):
    # A sample is predicted as class 1 only if its probability exceeds the threshold.
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i
    plt.subplot(3, 3, j)
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    # NOTE(review): plot_confusion_matrix is a helper not shown in this excerpt.
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s'%i)
plt.show()

Oversampling strategy:

# SMOTE algorithm: generate an oversampled version of the training set.
from imblearn.over_sampling import SMOTE
oversampler = SMOTE(random_state=0)
# fit_sample() was renamed fit_resample() in imbalanced-learn.
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

Titanic survival prediction (missing-value imputation, mapping string values to numbers, feature extraction, model ensembling)
Using the linear regression algorithm

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Feature columns used to predict the "Survived" column.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
alg = LinearRegression()
# Three consecutive, unshuffled folds. Because the folds are not shuffled, the
# concatenated per-fold predictions below line up with the original row order.
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic):
    # Fit on the training rows of this fold only.
    train_predictors = titanic[predictors].iloc[train, :]
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    # Predict the held-out rows of this fold.
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

# Stitch the per-fold outputs together and threshold them at 0.5 to get 0/1 labels.
predictions = np.concatenate(predictions, axis=0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Accuracy = fraction of rows whose predicted label matches the true label.
# (Summing the matching predictions would only count correctly predicted 1s.)
accuracy = (predictions == titanic["Survived"].values).mean()

Trying the logistic regression algorithm

# sklearn.cross_validation was removed from scikit-learn; cross_val_score now
# lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
# 3-fold cross-validated scores of the classifier on the chosen predictors.
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Print the mean so the result is visible outside a notebook cell as well.
print(scores.mean())

Trying the random forest algorithm

from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was removed from scikit-learn; use model_selection.
from sklearn.model_selection import KFold, cross_val_score

alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Three unshuffled folds. With shuffling off, a random_state has no effect and
# current scikit-learn rejects it, so it is omitted.
kf = KFold(n_splits=3)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)

Adjust the parameters

# Same classifier with adjusted hyperparameters: more trees, and larger
# minimum sample counts for splitting a node and for forming a leaf.
alg = RandomForestClassifier(
    random_state=1,
    n_estimators=50,
    min_samples_split=4,
    min_samples_leaf=2,
)

Extracting features

# Family size: the original snippet left both column names empty, which raises
# a KeyError.
# NOTE(review): SibSp + Parch is assumed here because both columns are used as
# predictors elsewhere on this page — confirm against the original tutorial.
titanic["Familysize"] = titanic["SibSp"] + titanic["Parch"]
# Length of each passenger's name, in characters.
titanic["NameLength"] = titanic["Name"].apply(len)
import re
def get_title(name):
    """Return the first run of ASCII letters that is directly followed by a period.

    For a name such as "Braund, Mr. Owen Harris" this yields "Mr".

    Args:
        name: The name string to search.

    Returns:
        The matched letters without the trailing period, or "" when the name
        contains no such run.
    """
    # Raw string so that "\." is a regex escape rather than an (invalid)
    # Python string escape.
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
# Extract a title from every name and show how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(titles.value_counts())
# Replace each known title string by its numeric code.
# NOTE(review): titles that are not listed here are left as strings — extend
# the mapping before feeding the "Title" column to a model.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3}
for k, v in title_mapping.items():
    titles[titles == k] = v
titanic["Title"] = titles

Verify the importance of each feature

import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
# Candidate features to score. The original snippet listed three empty
# placeholder names, which cannot be selected from the frame and are fewer
# than k=5.
# NOTE(review): this list is assembled from the columns used or created
# elsewhere on this page — adjust it to the columns your frame actually has.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Familysize", "NameLength", "Title"]
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# The fitted p-values are exposed as pvalues_ (trailing underscore). Taking
# -log10 turns small p-values into tall bars.
scores = -np.log10(selector.pvalues_)
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

Algorithm integration

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# Each entry pairs a classifier with the feature columns it is trained on.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex"]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex"]],
]
# Three consecutive, unshuffled folds, so the concatenated fold predictions
# line up with the original row order.
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Column 1 of predict_proba is taken as the probability of class 1.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Average the two models' probabilities, then threshold at 0.5.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    # Keep this fold's labels; the original loop computed them and then
    # discarded them.
    predictions.append(test_predictions)

# Stitch the folds together and report the fraction of correct predictions.
predictions = np.concatenate(predictions, axis=0)
accuracy = (predictions == titanic["Survived"].values).mean()

原网站

版权声明
本文为[Strong fight]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/203/202207211649265383.html

当前位置：网站首页>Fraud detection cases and Titanic rescued cases

Fraud detection cases and Titanic rescued cases

边栏推荐

猜你喜欢

随机推荐