Anti-fraud model¶

Development of a model that, from the data in the file train.csv, predicts the value of the FRAUDE variable for any given transaction, where FRAUDE is 1 if the transaction was fraudulent and 0 if it was legitimate¶

In [780]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
In [781]:
data = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")
In [782]:
#data["FECHA"]
In [783]:
data_test["Dist_max_INTER"]#.columns
data_test.columns.values.shape

##.dtypes
data_test.columns.values#.columns
Out[783]:
array(['id', 'FRAUDE', 'VALOR', 'HORA_AUX', 'Dist_max_COL',
       'Dist_max_INTER', 'Canal1', 'FECHA_FRAUDE', 'COD_PAIS', 'CANAL',
       'FECHA', 'DIASEM', 'DIAMES', 'FECHA_VIN', 'OFICINA_VIN', 'SEXO',
       'SEGMENTO', 'EDAD', 'INGRESOS', 'EGRESOS', 'NROPAISES',
       'Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER',
       'NROCIUDADES', 'Dist_Sum_NAL', 'Dist_Mean_NAL', 'Dist_HOY',
       'Dist_sum_NAL', 'Dist_mean_NAL', 'Dist_sum_INTER',
       'Dist_mean_INTER'], dtype=object)
In [784]:
#data[data["id"]==98523068]
data.columns.values 
Out[784]:
array(['id', 'FRAUDE', 'VALOR', 'HORA_AUX', 'Dist_max_NAL', 'Canal1',
       'FECHA', 'COD_PAIS', 'CANAL', 'DIASEM', 'DIAMES', 'FECHA_VIN',
       'OFICINA_VIN', 'SEXO', 'SEGMENTO', 'EDAD', 'INGRESOS', 'EGRESOS',
       'NROPAISES', 'Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER',
       'NROCIUDADES', 'Dist_Mean_NAL', 'Dist_HOY', 'Dist_sum_NAL'],
      dtype=object)
In [785]:
# The classes are imbalanced; SMOTE oversampling can be applied later to compensate
data["FRAUDE"].value_counts()
Out[785]:
0    2234
1     731
Name: FRAUDE, dtype: int64
In [786]:
731/(2234+731)
Out[786]:
0.24654300168634063
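About 25% of the transactions are fraudulent, so the imbalance is moderate. Besides SMOTE (applied further below), a lighter alternative is to weight the classes directly in the classifier; a minimal sketch, not executed in this notebook, assuming the same X_train / y_train split built later:

# Sketch: class weighting as an alternative to SMOTE oversampling
from sklearn.ensemble import RandomForestClassifier

# class_weight='balanced' reweights samples inversely to class frequency,
# so no synthetic minority samples need to be generated
weighted_rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
# weighted_rf.fit(X_train, y_train)  # X_train / y_train are defined later in the notebook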
In [787]:
# Get the intersection of columns present in both datasets
common_columns = np.intersect1d(data.columns.values, data_test.columns.values)

# Keep the common columns in the same order as they appear in data_test
common_columns_sorted = data_test.columns.values[np.isin(data_test.columns.values, common_columns)]

print("Columns common to both datasets, in data_test order:")
print(common_columns_sorted)
Columns common to both datasets, in data_test order:
['id' 'FRAUDE' 'VALOR' 'HORA_AUX' 'Canal1' 'COD_PAIS' 'CANAL' 'FECHA'
 'DIASEM' 'DIAMES' 'FECHA_VIN' 'OFICINA_VIN' 'SEXO' 'SEGMENTO' 'EDAD'
 'INGRESOS' 'EGRESOS' 'NROPAISES' 'Dist_Sum_INTER' 'Dist_Mean_INTER'
 'Dist_Max_INTER' 'NROCIUDADES' 'Dist_Mean_NAL' 'Dist_HOY' 'Dist_sum_NAL']
In [788]:
# Keep only the columns common to both datasets
data_train = data[common_columns_sorted]
In [789]:
data_train.isnull().sum()
Out[789]:
id                    0
FRAUDE                0
VALOR                 0
HORA_AUX              0
Canal1                0
COD_PAIS              0
CANAL                 0
FECHA                 0
DIASEM                0
DIAMES                0
FECHA_VIN            24
OFICINA_VIN          24
SEXO                 55
SEGMENTO             24
EDAD                 24
INGRESOS             24
EGRESOS              24
NROPAISES             0
Dist_Sum_INTER     1547
Dist_Mean_INTER    1547
Dist_Max_INTER     1547
NROCIUDADES           0
Dist_Mean_NAL       457
Dist_HOY              0
Dist_sum_NAL          0
dtype: int64
In [790]:
data_train
Out[790]:
id FRAUDE VALOR HORA_AUX Canal1 COD_PAIS CANAL FECHA DIASEM DIAMES ... INGRESOS EGRESOS NROPAISES Dist_Sum_INTER Dist_Mean_INTER Dist_Max_INTER NROCIUDADES Dist_Mean_NAL Dist_HOY Dist_sum_NAL
0 9000000001 1 0.00 13 ATM_INT US ATM_INT 20150501 5 1 ... 1200000.0 1200000.0 1 NaN NaN NaN 6 474.94 4552.41 5224.36
1 9000000002 1 0.00 17 ATM_INT US ATM_INT 20150515 5 15 ... 5643700.0 500000.0 1 NaN NaN NaN 5 289.99 4552.41 2029.90
2 9000000003 1 0.00 13 ATM_INT US ATM_INT 20150501 5 1 ... 1200000.0 1200000.0 1 NaN NaN NaN 6 474.94 4552.41 5224.36
3 9000000004 1 0.00 13 ATM_INT US ATM_INT 20150501 5 1 ... 1200000.0 1200000.0 1 NaN NaN NaN 6 474.94 4552.41 5224.36
4 9000000005 1 0.00 0 ATM_INT CR ATM_INT 20150510 0 10 ... 0.0 0.0 1 NaN NaN NaN 1 NaN 1482.35 1.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2960 622529101 1 993430.04 19 POS US POS 20150519 2 19 ... 103918285.0 95475378.0 4 8944.83 2236.21 3646.67 4 96.86 4552.41 484.30
2961 2043206272 0 9957.05 10 POS US POS 20150524 0 24 ... 23625000.0 5000000.0 3 27648.32 3949.76 4552.41 11 82.67 4552.41 2810.75
2962 2943206272 0 9957.05 10 POS US POS 20150524 0 24 ... 23625000.0 5000000.0 3 27648.32 3949.76 4552.41 11 82.67 4552.41 2810.75
2963 3136302872 0 996191.64 15 POS US MCI 20150513 3 13 ... 56666000.0 37600750.0 1 NaN NaN NaN 3 219.46 4552.41 1316.79
2964 1953178702 1 999276.60 16 ATM_INT CR ATM_INT 20150520 3 20 ... 12853000.0 6156000.0 1 NaN NaN NaN 1 NaN 1482.35 1.00

2965 rows × 25 columns

In [791]:
data_train.isnull().sum()
data_train["SEGMENTO"].value_counts()
Out[791]:
Personal Plus    1527
Preferencial      958
Personal          174
Emprendedor       159
PYME              119
Empresarial         4
Name: SEGMENTO, dtype: int64
In [792]:
# Count fraudulent vs. legitimate transactions
fraude_counts = data['FRAUDE'].value_counts()

# Bar chart of the class distribution
plt.figure(figsize=(8, 6))
fraude_counts.plot(kind='bar', color=['blue', 'red'])
plt.title('Number of Transactions with and without Fraud')
plt.xlabel('Fraud')
plt.ylabel('Number of Transactions')
plt.xticks(ticks=[0, 1], labels=['No Fraud', 'Fraud'], rotation=0)
plt.show()
In [793]:
# Count transactions by day of the week (DIASEM)
transactions_by_day = data_train.groupby('DIASEM')['id'].count()

# Count fraudulent transactions by day of the week (DIASEM)
fraud_transactions_by_day = data_train[data_train['FRAUDE'] == 1].groupby('DIASEM')['id'].count()

# Count transactions by day of the month (DIAMES)
transactions_by_date = data_train.groupby('DIAMES')['id'].count()

# Count fraudulent transactions by day of the month (DIAMES)
fraud_transactions_by_date = data_train[data_train['FRAUDE'] == 1].groupby('DIAMES')['id'].count()

# Create figure and axes
fig, axs = plt.subplots(2, 1, figsize=(10, 10))

# Bar chart of fraudulent transactions by day of the week
#transactions_by_day.plot(kind='bar', color='blue', ax=axs[0], label='Transactions')
fraud_transactions_by_day.plot(kind='bar', color='red', ax=axs[0], label='Fraudulent')
axs[0].set_title('Fraudulent Transactions by Day of the Week')
axs[0].set_xlabel('Day of the Week')
axs[0].set_ylabel('Number of Transactions')
axs[0].legend()

# Bar chart of fraudulent transactions by day of the month
fraud_transactions_by_date.plot(kind='bar', color='red', ax=axs[1], label='Fraudulent')
axs[1].set_title('Fraudulent Transactions by Day of the Month')
axs[1].set_xlabel('Day of the Month')
axs[1].set_ylabel('Number of Transactions')
axs[1].legend()

plt.tight_layout()
plt.show()
In [794]:
# Drop identifier, date, and mostly-missing columns in a single call with inplace=True
data_train.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER',"COD_PAIS","id"
                         ,"FECHA_VIN","OFICINA_VIN","CANAL","FECHA"], inplace=True)
/tmp/ipykernel_147672/3244588327.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER',"COD_PAIS","id"
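The SettingWithCopyWarning appears because data_train was created as a slice of data. A minimal sketch of how it could be avoided (not executed here): take an explicit copy when selecting the common columns, or reassign instead of mutating in place.

# Sketch: two ways to avoid the SettingWithCopyWarning
cols_to_drop = ['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER', 'COD_PAIS', 'id',
                'FECHA_VIN', 'OFICINA_VIN', 'CANAL', 'FECHA']

# Option 1: work on an explicit copy of the selected columns
data_train = data[common_columns_sorted].copy()
data_train.drop(columns=cols_to_drop, inplace=True)

# Option 2: avoid in-place mutation and reassign the result
data_train = data[common_columns_sorted].drop(columns=cols_to_drop)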
In [795]:
#print(data_train["CANAL"].value_counts())
print(data_train["Canal1"].value_counts())
data_train.isnull().sum()
POS        2329
ATM_INT     636
Name: Canal1, dtype: int64
Out[795]:
FRAUDE             0
VALOR              0
HORA_AUX           0
Canal1             0
DIASEM             0
DIAMES             0
SEXO              55
SEGMENTO          24
EDAD              24
INGRESOS          24
EGRESOS           24
NROPAISES          0
NROCIUDADES        0
Dist_Mean_NAL    457
Dist_HOY           0
Dist_sum_NAL       0
dtype: int64
In [796]:
# One-hot encode the categorical variables
data_encoded = pd.get_dummies(data_train, columns=['Canal1', 'SEXO', 'SEGMENTO'])
In [797]:
# Data that will be used for the model
data_encoded
Out[797]:
FRAUDE VALOR HORA_AUX DIASEM DIAMES EDAD INGRESOS EGRESOS NROPAISES NROCIUDADES ... Canal1_ATM_INT Canal1_POS SEXO_F SEXO_M SEGMENTO_Emprendedor SEGMENTO_Empresarial SEGMENTO_PYME SEGMENTO_Personal SEGMENTO_Personal Plus SEGMENTO_Preferencial
0 1 0.00 13 5 1 29.0 1200000.0 1200000.0 1 6 ... 1 0 0 1 0 0 0 0 1 0
1 1 0.00 17 5 15 29.0 5643700.0 500000.0 1 5 ... 1 0 0 1 0 0 0 0 1 0
2 1 0.00 13 5 1 29.0 1200000.0 1200000.0 1 6 ... 1 0 0 1 0 0 0 0 1 0
3 1 0.00 13 5 1 29.0 1200000.0 1200000.0 1 6 ... 1 0 0 1 0 0 0 0 1 0
4 1 0.00 0 0 10 25.0 0.0 0.0 1 1 ... 1 0 0 1 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2960 1 993430.04 19 2 19 48.0 103918285.0 95475378.0 4 4 ... 0 1 1 0 0 0 0 0 0 1
2961 0 9957.05 10 0 24 35.0 23625000.0 5000000.0 3 11 ... 0 1 1 0 0 0 0 0 0 1
2962 0 9957.05 10 0 24 35.0 23625000.0 5000000.0 3 11 ... 0 1 1 0 0 0 0 0 0 1
2963 0 996191.64 15 3 13 34.0 56666000.0 37600750.0 1 3 ... 0 1 1 0 0 0 1 0 0 0
2964 1 999276.60 16 3 20 29.0 12853000.0 6156000.0 1 1 ... 1 0 1 0 0 0 0 0 1 0

2965 rows × 23 columns

In [ ]:
 
In [798]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
In [799]:
# Feature selection using feature importances (earlier attempt, kept commented out; reworked below with imputation)
#X = data_encoded.drop(columns=['FRAUDE'])
#y = data_encoded['FRAUDE']
#clf = RandomForestClassifier(n_estimators=100, random_state=42)
#clf.fit(X, y)
#feature_selector = SelectFromModel(clf, prefit=True)
#X_selected = feature_selector.transform(X)
In [800]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
# Feature selection using feature importances
X = data_encoded.drop(columns=['FRAUDE'])
y = data_encoded['FRAUDE']

# Impute missing values with the column mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Fit a Random Forest and keep only the features above the default importance threshold
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_imputed, y)
feature_selector = SelectFromModel(clf, prefit=True)
X_selected = feature_selector.transform(X_imputed)
In [801]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Apply SMOTE only to the training split (optional), so no synthetic samples leak into the evaluation set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
In [ ]:
 
In [802]:
# Train the fraud-detection model (on the original, non-resampled training split)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out test split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       456
           1       0.92      0.91      0.91       137

    accuracy                           0.96       593
   macro avg       0.95      0.94      0.94       593
weighted avg       0.96      0.96      0.96       593

In [803]:
X_train.shape
X_test.shape
X_train_smote.shape
Out[803]:
(3556, 10)
In [804]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
# Classification models to compare
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}

# Train each model on the SMOTE-resampled training split and evaluate it
for name, classifier in classifiers.items():
    print(f"Training model: {name}")
    if name == "XGBoost" or name == "LightGBM":
        classifier.fit(X_train_smote, y_train_smote, eval_metric='logloss')
    else:
        classifier.fit(X_train_smote, y_train_smote)

    # Predict and evaluate on the held-out test split
    y_pred = classifier.predict(X_test)
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    print("\n")
Training model: Random Forest
Results for Random Forest:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       456
           1       0.86      0.90      0.88       137

    accuracy                           0.94       593
   macro avg       0.91      0.93      0.92       593
weighted avg       0.94      0.94      0.94       593



Training model: Logistic Regression
Results for Logistic Regression:
              precision    recall  f1-score   support

           0       0.87      0.61      0.72       456
           1       0.35      0.69      0.46       137

    accuracy                           0.63       593
   macro avg       0.61      0.65      0.59       593
weighted avg       0.75      0.63      0.66       593



Training model: XGBoost
/home/jordan/.local/lib/python3.10/site-packages/xgboost/sklearn.py:835: UserWarning: `eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.
  warnings.warn(
Results for XGBoost:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       456
           1       0.91      0.90      0.90       137

    accuracy                           0.96       593
   macro avg       0.94      0.94      0.94       593
weighted avg       0.96      0.96      0.96       593



Training model: LightGBM
Results for LightGBM:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       456
           1       0.89      0.91      0.90       137

    accuracy                           0.95       593
   macro avg       0.93      0.94      0.93       593
weighted avg       0.95      0.95      0.95       593



In [805]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# Helper to plot the ROC curve and report the AUC for one model
def plot_roc_curve(y_true, y_pred_probs, model_name):
    auc = roc_auc_score(y_true, y_pred_probs)
    fpr, tpr, _ = roc_curve(y_true, y_pred_probs)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.grid(True)

# Train each model and plot its ROC curve
for name, classifier in classifiers.items():
    print(f"Entrenando modelo {name}")
    if name == "XGBoost" or name == "LightGBM":
        classifier.fit(X_train_smote, y_train_smote, eval_metric='logloss')
        y_pred_probs = classifier.predict_proba(X_test)[:, 1]
    else:
        classifier.fit(X_train_smote, y_train_smote)
        y_pred_probs = classifier.predict_proba(X_test)[:, 1]
    
    # Plot the ROC curve for this model
    plot_roc_curve(y_test, y_pred_probs, name)

# Show the ROC curves for all models together
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()
Training model: Random Forest
Training model: Logistic Regression
Training model: XGBoost
/home/jordan/.local/lib/python3.10/site-packages/xgboost/sklearn.py:835: UserWarning: `eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.
  warnings.warn(
Training model: LightGBM
In [ ]:
 
In [ ]:
 

Application to the data_test dataset¶

In [809]:
data_test1 = data_test[common_columns_sorted]
In [ ]:
 
In [810]:
# Drop the same identifier, date, and mostly-missing columns from the test set, in a single call with inplace=True
data_test1.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER',"COD_PAIS","id"
                         ,"FECHA_VIN","OFICINA_VIN","CANAL","FECHA"], inplace=True)
/tmp/ipykernel_147672/1368707283.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test1.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER',"COD_PAIS","id"
In [811]:
data_test1
Out[811]:
FRAUDE VALOR HORA_AUX Canal1 DIASEM DIAMES SEXO SEGMENTO EDAD INGRESOS EGRESOS NROPAISES NROCIUDADES Dist_Mean_NAL Dist_HOY Dist_sum_NAL
0 NaN 42230.09 18 POS 5 15 F Personal Plus 46 20000000 10000000 1 1 NaN 4552.41 1.00
1 NaN 143202.65 20 POS 3 6 F Preferencial 56 11000000 4500000 3 2 614.04 4552.41 1228.07
2 NaN 243591.25 2 ATM_INT 0 17 F Personal Plus 33 9000000 4000000 3 7 138.88 5083.41 1944.35
3 NaN 238267.40 20 ATM_INT 5 8 F Personal Plus 53 2300000 500000 1 1 NaN 904.81 1.00
4 NaN 490403.58 13 ATM_INT 5 1 M Personal 0 0 0 1 1 NaN 4552.41 1.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 NaN 531534.03 13 POS 5 1 M Preferencial 51 19327667 3000000 1 4 56.29 4552.41 844.35
96 NaN 52035.08 11 POS 0 3 M Preferencial 40 35500000 2000000 2 3 25.25 971.23 151.52
97 NaN 18309.04 23 POS 5 15 M Personal Plus 43 3100000 2000000 1 2 61.45 4552.41 122.90
98 NaN 496906.75 20 ATM_INT 6 16 M Personal Plus 40 24000000 10500000 1 3 453.23 4552.41 1812.93
99 NaN 192825.50 20 POS 5 15 M Preferencial 36 47736000 3000000 2 9 113.45 4552.41 5218.81

100 rows × 16 columns

In [ ]:
 
In [812]:
# One-hot encode the categorical variables in the test set
data_encoded = pd.get_dummies(data_test1, columns=['Canal1', 'SEXO', 'SEGMENTO'])
In [813]:
data_encoded.isnull().sum()
Out[813]:
FRAUDE                    100
VALOR                       0
HORA_AUX                    0
DIASEM                      0
DIAMES                      0
EDAD                        0
INGRESOS                    0
EGRESOS                     0
NROPAISES                   0
NROCIUDADES                 0
Dist_Mean_NAL              21
Dist_HOY                    0
Dist_sum_NAL                0
Canal1_ATM_INT              0
Canal1_POS                  0
SEXO_F                      0
SEXO_M                      0
SEGMENTO_Emprendedor        0
SEGMENTO_PYME               0
SEGMENTO_Personal           0
SEGMENTO_Personal Plus      0
SEGMENTO_Preferencial       0
dtype: int64
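Note that get_dummies on these 100 test rows only creates columns for the categories actually present (for example, there is no SEGMENTO_Empresarial column here, unlike in the training encoding). The features used below happen to exist in both encodings, but a more robust approach is to align the test dummies to the training columns. A minimal sketch, assuming the training-time column list had been saved in a (hypothetical) variable train_dummy_columns before data_encoded was overwritten:

# Sketch: align the test-set dummies to the training-set columns (not executed here)
# train_dummy_columns is a hypothetical list of the training-time encoded column names
data_encoded_aligned = data_encoded.reindex(columns=train_dummy_columns, fill_value=0)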
In [814]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
In [815]:
# Keep the same features that the feature selector chose during training
data_encoded1 = data_encoded[['VALOR', 'HORA_AUX', 'DIAMES', 'EDAD', 'INGRESOS', 'EGRESOS',
       'Dist_Mean_NAL', 'Dist_sum_NAL', 'Canal1_ATM_INT', 'Canal1_POS']]
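Hard-coding the selected column names works, but they can also be derived from the fitted selector so the test features always match the ones used in training; a minimal sketch reusing feature_selector and X from the training section (the same mechanism shown in the appendix below):

# Sketch: derive the selected feature names from the fitted selector (not executed here)
selected_features = X.columns[feature_selector.get_support()]
data_encoded1 = data_encoded[selected_features]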
In [816]:
X_ = data_encoded1
In [817]:
from sklearn.impute import KNNImputer

# Impute the remaining missing values in the test features with a KNN imputer
# (note: the training features were imputed with the column mean via SimpleImputer)
imputeKNN = KNNImputer(n_neighbors=2)
X_imputed = imputeKNN.fit_transform(X_)
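The training features were imputed with the column mean, while here the test features are imputed with a KNN imputer fitted on the test rows themselves, so the two pipelines are not strictly consistent. A minimal sketch of the more conventional alternative, reusing the SimpleImputer fitted on the training data; it assumes the test frame has been aligned to the columns the imputer was fitted on, as in the reindex sketch above:

# Sketch: reuse the training-time imputer instead of fitting a new one on the test rows
# (not executed here; data_encoded_aligned is the hypothetical aligned test frame from above)
X_test_features = data_encoded_aligned.drop(columns=['FRAUDE'])
X_imputed_test = imputer.transform(X_test_features)           # transform only: keep the training-time means
X_selected_test = feature_selector.transform(X_imputed_test)  # same 10 features the model was trained on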
In [ ]:
 
In [818]:
X_imputed.shape
Out[818]:
(100, 10)
In [ ]:
 
In [ ]:
 
In [819]:
# Generate predictions for the competition test set
y_pred = model.predict(X_imputed)
In [ ]:
 
In [820]:
# PREDICTIONS
y_pred
Out[820]:
array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
In [ ]:
 

The rows used for prediction are never reordered, so the predictions are in the same order as the rows of the original test file¶
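Since the order is preserved, each prediction can be paired with its transaction id by position; a minimal sketch of a compact results frame (the column name FRAUDE_PRED is just an illustrative choice, not used elsewhere in this notebook):

# Sketch: pair each prediction with its transaction id by position (not executed here)
predictions = pd.DataFrame({
    'id': data_test['id'].values,
    'FRAUDE_PRED': y_pred,
})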

In [ ]:
 
In [821]:
data_test
Out[821]:
id FRAUDE VALOR HORA_AUX Dist_max_COL Dist_max_INTER Canal1 FECHA_FRAUDE COD_PAIS CANAL ... Dist_Mean_INTER Dist_Max_INTER NROCIUDADES Dist_Sum_NAL Dist_Mean_NAL Dist_HOY Dist_sum_NAL Dist_mean_NAL Dist_sum_INTER Dist_mean_INTER
0 98523068 NaN 42230.09 18 1.00 1.00 POS 20150515 US POS ... NaN NaN 1 NaN NaN 4552.41 1.00 1.00 1.00 1.00
1 300237898 NaN 143202.65 20 614.04 7632.97 POS 20150506 US MCI ... 6092.69 7632.97 2 1228.07 614.04 4552.41 1228.07 614.04 24370.75 6092.69
2 943273308 NaN 243591.25 2 286.84 2443.14 ATM_INT 20150517 EC ATM_INT ... 1743.52 2443.14 7 1944.35 138.88 5083.41 1944.35 138.88 6974.09 1743.52
3 951645809 NaN 238267.40 20 1.00 1.00 ATM_INT 20150508 EC ATM_INT ... NaN NaN 1 NaN NaN 904.81 1.00 1.00 1.00 1.00
4 963797516 NaN 490403.58 13 1.00 1.00 ATM_INT 20150501 US ATM_INT ... NaN NaN 1 NaN NaN 4552.41 1.00 1.00 1.00 1.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 9970518152 NaN 531534.03 13 340.09 1.00 POS 20150501 US POS ... NaN NaN 4 844.35 56.29 4552.41 844.35 56.29 1.00 1.00
96 9971748725 NaN 52035.08 11 28.59 4552.41 POS 20150503 AW POS ... 4552.41 4552.41 3 151.52 25.25 971.23 151.52 25.25 9104.82 4552.41
97 9979565282 NaN 18309.04 23 61.45 1.00 POS 20150515 US POS ... NaN NaN 2 122.90 61.45 4552.41 122.90 61.45 1.00 1.00
98 9979718478 NaN 496906.75 20 733.11 1.00 ATM_INT 20150516 US ATM_INT ... NaN NaN 3 1812.93 453.23 4552.41 1812.93 453.23 1.00 1.00
99 9998668320 NaN 192825.50 20 337.29 904.81 POS 20150515 US MCI ... 904.81 904.81 9 5218.81 113.45 4552.41 5218.81 113.45 1809.62 904.81

100 rows × 32 columns

In [822]:
data_test.loc[:,"FRAUDE"] = list(y_pred)
#df.loc[:, 'Column1']
/tmp/ipykernel_147672/3603763774.py:1: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  data_test.loc[:,"FRAUDE"] = list(y_pred)
In [823]:
#data_test1["Fraude"].shape
In [824]:
data_test
Out[824]:
id FRAUDE VALOR HORA_AUX Dist_max_COL Dist_max_INTER Canal1 FECHA_FRAUDE COD_PAIS CANAL ... Dist_Mean_INTER Dist_Max_INTER NROCIUDADES Dist_Sum_NAL Dist_Mean_NAL Dist_HOY Dist_sum_NAL Dist_mean_NAL Dist_sum_INTER Dist_mean_INTER
0 98523068 0 42230.09 18 1.00 1.00 POS 20150515 US POS ... NaN NaN 1 NaN NaN 4552.41 1.00 1.00 1.00 1.00
1 300237898 0 143202.65 20 614.04 7632.97 POS 20150506 US MCI ... 6092.69 7632.97 2 1228.07 614.04 4552.41 1228.07 614.04 24370.75 6092.69
2 943273308 1 243591.25 2 286.84 2443.14 ATM_INT 20150517 EC ATM_INT ... 1743.52 2443.14 7 1944.35 138.88 5083.41 1944.35 138.88 6974.09 1743.52
3 951645809 1 238267.40 20 1.00 1.00 ATM_INT 20150508 EC ATM_INT ... NaN NaN 1 NaN NaN 904.81 1.00 1.00 1.00 1.00
4 963797516 1 490403.58 13 1.00 1.00 ATM_INT 20150501 US ATM_INT ... NaN NaN 1 NaN NaN 4552.41 1.00 1.00 1.00 1.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 9970518152 0 531534.03 13 340.09 1.00 POS 20150501 US POS ... NaN NaN 4 844.35 56.29 4552.41 844.35 56.29 1.00 1.00
96 9971748725 0 52035.08 11 28.59 4552.41 POS 20150503 AW POS ... 4552.41 4552.41 3 151.52 25.25 971.23 151.52 25.25 9104.82 4552.41
97 9979565282 0 18309.04 23 61.45 1.00 POS 20150515 US POS ... NaN NaN 2 122.90 61.45 4552.41 122.90 61.45 1.00 1.00
98 9979718478 0 496906.75 20 733.11 1.00 ATM_INT 20150516 US ATM_INT ... NaN NaN 3 1812.93 453.23 4552.41 1812.93 453.23 1.00 1.00
99 9998668320 0 192825.50 20 337.29 904.81 POS 20150515 US MCI ... 904.81 904.81 9 5218.81 113.45 4552.41 5218.81 113.45 1809.62 904.81

100 rows × 32 columns

In [580]:
data_test.to_csv("test_evaluado.csv")  

## Appendix: variables selected using feature importance¶

In [825]:
import numpy as np

# Indices of the selected features
selected_indices = np.where(feature_selector.get_support())[0]

# Names of the selected features
selected_features = X.columns[selected_indices]

print("Selected features:")
print(selected_features)
selected_features.shape
Selected features:
Index(['VALOR', 'HORA_AUX', 'DIAMES', 'EDAD', 'INGRESOS', 'EGRESOS',
       'Dist_Mean_NAL', 'Dist_sum_NAL', 'Canal1_ATM_INT', 'Canal1_POS'],
      dtype='object')
Out[825]:
(10,)
In [826]:
import matplotlib.pyplot as plt

# Feature importances from the Random Forest classifier (impurity-based importances, not an F-score)
feature_importances = clf.feature_importances_

# DataFrame to visualize the feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the features from most to least important
feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Plot the feature importances as a horizontal bar chart
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances (Random Forest)')
plt.gca().invert_yaxis()  # show the most important feature at the top
plt.show()
In [ ]: