import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
data = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")
# Quick look at the test set structure
data_test["Dist_max_INTER"]
data_test.columns.values.shape
data_test.columns.values
array(['id', 'FRAUDE', 'VALOR', 'HORA_AUX', 'Dist_max_COL',
'Dist_max_INTER', 'Canal1', 'FECHA_FRAUDE', 'COD_PAIS', 'CANAL',
'FECHA', 'DIASEM', 'DIAMES', 'FECHA_VIN', 'OFICINA_VIN', 'SEXO',
'SEGMENTO', 'EDAD', 'INGRESOS', 'EGRESOS', 'NROPAISES',
'Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER',
'NROCIUDADES', 'Dist_Sum_NAL', 'Dist_Mean_NAL', 'Dist_HOY',
'Dist_sum_NAL', 'Dist_mean_NAL', 'Dist_sum_INTER',
'Dist_mean_INTER'], dtype=object)
#data[data["id"]==98523068]
data.columns.values
array(['id', 'FRAUDE', 'VALOR', 'HORA_AUX', 'Dist_max_NAL', 'Canal1',
'FECHA', 'COD_PAIS', 'CANAL', 'DIASEM', 'DIAMES', 'FECHA_VIN',
'OFICINA_VIN', 'SEXO', 'SEGMENTO', 'EDAD', 'INGRESOS', 'EGRESOS',
'NROPAISES', 'Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER',
'NROCIUDADES', 'Dist_Mean_NAL', 'Dist_HOY', 'Dist_sum_NAL'],
dtype=object)
# The classes are imbalanced (about 25% fraud); SMOTE oversampling can be applied later
data["FRAUDE"].value_counts()
0    2234
1     731
Name: FRAUDE, dtype: int64
731/(2234+731)
0.24654300168634063
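# About 25% of the rows are fraudulent. A minimal, self-contained sketch of how SMOTE
# rebalances such a distribution (synthetic data only; the real resampling is done later,
# after the train/test split, so no synthetic rows leak into the evaluation):
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X_demo, y_demo = make_classification(n_samples=2965, weights=[0.75, 0.25], random_state=42)
print(Counter(y_demo))   # imbalanced, roughly 3:1
X_res, y_res = SMOTE(random_state=42).fit_resample(X_demo, y_demo)
print(Counter(y_res))    # both classes now have the same number of samples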
# Get the intersection of the train and test columns
common_columns = np.intersect1d(data.columns.values, data_test.columns.values)
# Keep the common columns in the order they appear in the test set
common_columns_sorted = data_test.columns.values[np.isin(data_test.columns.values, common_columns)]
print("Columns common to both datasets, in the test-set column order:")
print(common_columns_sorted)
Columns common to both datasets, in the test-set column order:
['id' 'FRAUDE' 'VALOR' 'HORA_AUX' 'Canal1' 'COD_PAIS' 'CANAL' 'FECHA' 'DIASEM' 'DIAMES' 'FECHA_VIN' 'OFICINA_VIN' 'SEXO' 'SEGMENTO' 'EDAD' 'INGRESOS' 'EGRESOS' 'NROPAISES' 'Dist_Sum_INTER' 'Dist_Mean_INTER' 'Dist_Max_INTER' 'NROCIUDADES' 'Dist_Mean_NAL' 'Dist_HOY' 'Dist_sum_NAL']
#data.drop(['Dist_Sum_INTER', 'Dist_Mean_INTER'])
# Keep only the columns shared by both datasets
data_train = data[common_columns_sorted]
data_train.isnull().sum()
id                    0
FRAUDE                0
VALOR                 0
HORA_AUX              0
Canal1                0
COD_PAIS              0
CANAL                 0
FECHA                 0
DIASEM                0
DIAMES                0
FECHA_VIN            24
OFICINA_VIN          24
SEXO                 55
SEGMENTO             24
EDAD                 24
INGRESOS             24
EGRESOS              24
NROPAISES             0
Dist_Sum_INTER     1547
Dist_Mean_INTER    1547
Dist_Max_INTER     1547
NROCIUDADES           0
Dist_Mean_NAL       457
Dist_HOY              0
Dist_sum_NAL          0
dtype: int64
data_train
| | id | FRAUDE | VALOR | HORA_AUX | Canal1 | COD_PAIS | CANAL | FECHA | DIASEM | DIAMES | ... | INGRESOS | EGRESOS | NROPAISES | Dist_Sum_INTER | Dist_Mean_INTER | Dist_Max_INTER | NROCIUDADES | Dist_Mean_NAL | Dist_HOY | Dist_sum_NAL |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9000000001 | 1 | 0.00 | 13 | ATM_INT | US | ATM_INT | 20150501 | 5 | 1 | ... | 1200000.0 | 1200000.0 | 1 | NaN | NaN | NaN | 6 | 474.94 | 4552.41 | 5224.36 |
| 1 | 9000000002 | 1 | 0.00 | 17 | ATM_INT | US | ATM_INT | 20150515 | 5 | 15 | ... | 5643700.0 | 500000.0 | 1 | NaN | NaN | NaN | 5 | 289.99 | 4552.41 | 2029.90 |
| 2 | 9000000003 | 1 | 0.00 | 13 | ATM_INT | US | ATM_INT | 20150501 | 5 | 1 | ... | 1200000.0 | 1200000.0 | 1 | NaN | NaN | NaN | 6 | 474.94 | 4552.41 | 5224.36 |
| 3 | 9000000004 | 1 | 0.00 | 13 | ATM_INT | US | ATM_INT | 20150501 | 5 | 1 | ... | 1200000.0 | 1200000.0 | 1 | NaN | NaN | NaN | 6 | 474.94 | 4552.41 | 5224.36 |
| 4 | 9000000005 | 1 | 0.00 | 0 | ATM_INT | CR | ATM_INT | 20150510 | 0 | 10 | ... | 0.0 | 0.0 | 1 | NaN | NaN | NaN | 1 | NaN | 1482.35 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2960 | 622529101 | 1 | 993430.04 | 19 | POS | US | POS | 20150519 | 2 | 19 | ... | 103918285.0 | 95475378.0 | 4 | 8944.83 | 2236.21 | 3646.67 | 4 | 96.86 | 4552.41 | 484.30 |
| 2961 | 2043206272 | 0 | 9957.05 | 10 | POS | US | POS | 20150524 | 0 | 24 | ... | 23625000.0 | 5000000.0 | 3 | 27648.32 | 3949.76 | 4552.41 | 11 | 82.67 | 4552.41 | 2810.75 |
| 2962 | 2943206272 | 0 | 9957.05 | 10 | POS | US | POS | 20150524 | 0 | 24 | ... | 23625000.0 | 5000000.0 | 3 | 27648.32 | 3949.76 | 4552.41 | 11 | 82.67 | 4552.41 | 2810.75 |
| 2963 | 3136302872 | 0 | 996191.64 | 15 | POS | US | MCI | 20150513 | 3 | 13 | ... | 56666000.0 | 37600750.0 | 1 | NaN | NaN | NaN | 3 | 219.46 | 4552.41 | 1316.79 |
| 2964 | 1953178702 | 1 | 999276.60 | 16 | ATM_INT | CR | ATM_INT | 20150520 | 3 | 20 | ... | 12853000.0 | 6156000.0 | 1 | NaN | NaN | NaN | 1 | NaN | 1482.35 | 1.00 |
2965 rows × 25 columns
data_train.isnull().sum()
data_train["SEGMENTO"].value_counts()
Personal Plus    1527
Preferencial      958
Personal          174
Emprendedor       159
PYME              119
Empresarial         4
Name: SEGMENTO, dtype: int64
# Count the number of transactions with and without fraud
fraude_counts = data['FRAUDE'].value_counts()
# Bar chart of the class counts
plt.figure(figsize=(8, 6))
fraude_counts.plot(kind='bar', color=['blue', 'red'])
plt.title('Number of Transactions With and Without Fraud')
plt.xlabel('Fraud')
plt.ylabel('Number of Transactions')
plt.xticks(ticks=[0, 1], labels=['No Fraud', 'Fraud'], rotation=0)
plt.show()
# Count transactions by day of the week (DIASEM)
transactions_by_day = data_train.groupby('DIASEM')['id'].count()
# Count fraudulent transactions by day of the week (DIASEM)
fraud_transactions_by_day = data_train[data_train['FRAUDE'] == 1].groupby('DIASEM')['id'].count()
# Count transactions by day of the month (DIAMES)
transactions_by_date = data_train.groupby('DIAMES')['id'].count()
# Count fraudulent transactions by day of the month (DIAMES)
fraud_transactions_by_date = data_train[data_train['FRAUDE'] == 1].groupby('DIAMES')['id'].count()
# Create the figure and axes
fig, axs = plt.subplots(2, 1, figsize=(10, 10))
# Bar chart of fraudulent transactions by day of the week
# (the bars for all transactions are left commented out)
#transactions_by_day.plot(kind='bar', color='blue', ax=axs[0], label='Transactions')
fraud_transactions_by_day.plot(kind='bar', color='red', ax=axs[0], label='Fraudulent')
axs[0].set_title('Fraudulent Transactions by Day of the Week')
axs[0].set_xlabel('Day of the Week')
axs[0].set_ylabel('Number of Transactions')
axs[0].legend()
# Bar chart of fraudulent transactions by day of the month
fraud_transactions_by_date.plot(kind='bar', color='red', ax=axs[1], label='Fraudulent')
axs[1].set_title('Fraudulent Transactions by Day of the Month')
axs[1].set_xlabel('Day of the Month')
axs[1].set_ylabel('Number of Transactions')
axs[1].legend()
plt.tight_layout()
plt.show()
# Drop the listed columns in a single call with inplace=True
data_train.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER', 'COD_PAIS', 'id',
                         'FECHA_VIN', 'OFICINA_VIN', 'CANAL', 'FECHA'], inplace=True)
/tmp/ipykernel_147672/3244588327.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER', 'COD_PAIS', 'id',
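# Note: the SettingWithCopyWarning appears because data_train is a slice of data.
# A minimal sketch of one way to silence it (an optional alternative, not part of
# the original run) is to take an explicit copy before dropping columns:
data_train = data[common_columns_sorted].copy()
data_train.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER', 'COD_PAIS', 'id',
                         'FECHA_VIN', 'OFICINA_VIN', 'CANAL', 'FECHA'], inplace=True)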
#print(data_train["CANAL"].value_counts())
print(data_train["Canal1"].value_counts())
data_train.isnull().sum()
POS        2329
ATM_INT     636
Name: Canal1, dtype: int64
FRAUDE             0
VALOR              0
HORA_AUX           0
Canal1             0
DIASEM             0
DIAMES             0
SEXO              55
SEGMENTO          24
EDAD              24
INGRESOS          24
EGRESOS           24
NROPAISES          0
NROCIUDADES        0
Dist_Mean_NAL    457
Dist_HOY           0
Dist_sum_NAL       0
dtype: int64
# One-hot encode the categorical variables
data_encoded = pd.get_dummies(data_train, columns=['Canal1', 'SEXO', 'SEGMENTO'])
# Inspect the encoded dataset that will feed the model
data_encoded
| | FRAUDE | VALOR | HORA_AUX | DIASEM | DIAMES | EDAD | INGRESOS | EGRESOS | NROPAISES | NROCIUDADES | ... | Canal1_ATM_INT | Canal1_POS | SEXO_F | SEXO_M | SEGMENTO_Emprendedor | SEGMENTO_Empresarial | SEGMENTO_PYME | SEGMENTO_Personal | SEGMENTO_Personal Plus | SEGMENTO_Preferencial |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.00 | 13 | 5 | 1 | 29.0 | 1200000.0 | 1200000.0 | 1 | 6 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 1 | 0.00 | 17 | 5 | 15 | 29.0 | 5643700.0 | 500000.0 | 1 | 5 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 1 | 0.00 | 13 | 5 | 1 | 29.0 | 1200000.0 | 1200000.0 | 1 | 6 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 1 | 0.00 | 13 | 5 | 1 | 29.0 | 1200000.0 | 1200000.0 | 1 | 6 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 1 | 0.00 | 0 | 0 | 10 | 25.0 | 0.0 | 0.0 | 1 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2960 | 1 | 993430.04 | 19 | 2 | 19 | 48.0 | 103918285.0 | 95475378.0 | 4 | 4 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2961 | 0 | 9957.05 | 10 | 0 | 24 | 35.0 | 23625000.0 | 5000000.0 | 3 | 11 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2962 | 0 | 9957.05 | 10 | 0 | 24 | 35.0 | 23625000.0 | 5000000.0 | 3 | 11 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2963 | 0 | 996191.64 | 15 | 3 | 13 | 34.0 | 56666000.0 | 37600750.0 | 1 | 3 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2964 | 1 | 999276.60 | 16 | 3 | 20 | 29.0 | 12853000.0 | 6156000.0 | 1 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2965 rows × 23 columns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
# Feature selection using feature importances (first draft, kept commented out)
#X = data_encoded.drop(columns=['FRAUDE'])
#y = data_encoded['FRAUDE']
#clf = RandomForestClassifier(n_estimators=100, random_state=42)
#clf.fit(X, y)
#feature_selector = SelectFromModel(clf, prefit=True)
#X_selected = feature_selector.transform(X)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
# Feature selection using the feature importances of a random forest
X = data_encoded.drop(columns=['FRAUDE'])
y = data_encoded['FRAUDE']
# Impute missing values with the column mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# Fit a random forest and keep the features whose importance is above the default threshold
lf = RandomForestClassifier(n_estimators=100, random_state=42)
lf.fit(X_imputed, y)
feature_selector = SelectFromModel(lf, prefit=True)
X_selected = feature_selector.transform(X_imputed)
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
# Apply SMOTE to the training set (used later for the model comparison;
# the baseline model below is trained on the original, imbalanced training set)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# Train the baseline fraud-detection model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Evaluate the baseline model on the test set
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.97 0.98 0.97 456
1 0.92 0.91 0.91 137
accuracy 0.96 593
macro avg 0.95 0.94 0.94 593
weighted avg 0.96 0.96 0.96 593
X_train.shape
X_test.shape
X_train_smote.shape
(3556, 10)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
# Classification models
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}
# Train and evaluate each model on the SMOTE-resampled training set
for name, classifier in classifiers.items():
    print(f"Training model {name}")
    if name == "XGBoost" or name == "LightGBM":
        classifier.fit(X_train_smote, y_train_smote, eval_metric='logloss')
    else:
        classifier.fit(X_train_smote, y_train_smote)
    # Predict and evaluate on the test set
    y_pred = classifier.predict(X_test)
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    print("\n")
Training model Random Forest
Results for Random Forest:
precision recall f1-score support
0 0.97 0.96 0.96 456
1 0.86 0.90 0.88 137
accuracy 0.94 593
macro avg 0.91 0.93 0.92 593
weighted avg 0.94 0.94 0.94 593
Training model Logistic Regression
Results for Logistic Regression:
precision recall f1-score support
0 0.87 0.61 0.72 456
1 0.35 0.69 0.46 137
accuracy 0.63 593
macro avg 0.61 0.65 0.59 593
weighted avg 0.75 0.63 0.66 593
Training model XGBoost
/home/jordan/.local/lib/python3.10/site-packages/xgboost/sklearn.py:835: UserWarning: `eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead. warnings.warn(
Results for XGBoost:
precision recall f1-score support
0 0.97 0.97 0.97 456
1 0.91 0.90 0.90 137
accuracy 0.96 593
macro avg 0.94 0.94 0.94 593
weighted avg 0.96 0.96 0.96 593
Training model LightGBM
Results for LightGBM:
precision recall f1-score support
0 0.97 0.96 0.97 456
1 0.89 0.91 0.90 137
accuracy 0.95 593
macro avg 0.93 0.94 0.93 593
weighted avg 0.95 0.95 0.95 593
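# The XGBoost warning above comes from passing eval_metric to fit(). A minimal sketch,
# following the warning's own suggestion, moves it into the constructor so every model
# can be fitted with the same call (an optional cleanup, not part of the original run):
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
}
for name, classifier in classifiers.items():
    classifier.fit(X_train_smote, y_train_smote)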
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
# Helper to plot the ROC curve for one model
def plot_roc_curve(y_true, y_pred_probs, model_name):
    auc = roc_auc_score(y_true, y_pred_probs)
    fpr, tpr, _ = roc_curve(y_true, y_pred_probs)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.grid(True)
# Train each model and plot its ROC curve
for name, classifier in classifiers.items():
    print(f"Training model {name}")
    if name == "XGBoost" or name == "LightGBM":
        classifier.fit(X_train_smote, y_train_smote, eval_metric='logloss')
    else:
        classifier.fit(X_train_smote, y_train_smote)
    y_pred_probs = classifier.predict_proba(X_test)[:, 1]
    # Add this model's ROC curve to the figure
    plot_roc_curve(y_test, y_pred_probs, name)
# Show the ROC curves for all models
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()
Training model Random Forest
Training model Logistic Regression
Training model XGBoost
/home/jordan/.local/lib/python3.10/site-packages/xgboost/sklearn.py:835: UserWarning: `eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead. warnings.warn(
Training model LightGBM
data_test1 = data_test[common_columns_sorted]
# Drop the same columns that were removed from the training set
data_test1.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER', 'COD_PAIS', 'id',
                         'FECHA_VIN', 'OFICINA_VIN', 'CANAL', 'FECHA'], inplace=True)
/tmp/ipykernel_147672/1368707283.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test1.drop(columns=['Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Max_INTER', 'COD_PAIS', 'id',
data_test1
| | FRAUDE | VALOR | HORA_AUX | Canal1 | DIASEM | DIAMES | SEXO | SEGMENTO | EDAD | INGRESOS | EGRESOS | NROPAISES | NROCIUDADES | Dist_Mean_NAL | Dist_HOY | Dist_sum_NAL |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 42230.09 | 18 | POS | 5 | 15 | F | Personal Plus | 46 | 20000000 | 10000000 | 1 | 1 | NaN | 4552.41 | 1.00 |
| 1 | NaN | 143202.65 | 20 | POS | 3 | 6 | F | Preferencial | 56 | 11000000 | 4500000 | 3 | 2 | 614.04 | 4552.41 | 1228.07 |
| 2 | NaN | 243591.25 | 2 | ATM_INT | 0 | 17 | F | Personal Plus | 33 | 9000000 | 4000000 | 3 | 7 | 138.88 | 5083.41 | 1944.35 |
| 3 | NaN | 238267.40 | 20 | ATM_INT | 5 | 8 | F | Personal Plus | 53 | 2300000 | 500000 | 1 | 1 | NaN | 904.81 | 1.00 |
| 4 | NaN | 490403.58 | 13 | ATM_INT | 5 | 1 | M | Personal | 0 | 0 | 0 | 1 | 1 | NaN | 4552.41 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | NaN | 531534.03 | 13 | POS | 5 | 1 | M | Preferencial | 51 | 19327667 | 3000000 | 1 | 4 | 56.29 | 4552.41 | 844.35 |
| 96 | NaN | 52035.08 | 11 | POS | 0 | 3 | M | Preferencial | 40 | 35500000 | 2000000 | 2 | 3 | 25.25 | 971.23 | 151.52 |
| 97 | NaN | 18309.04 | 23 | POS | 5 | 15 | M | Personal Plus | 43 | 3100000 | 2000000 | 1 | 2 | 61.45 | 4552.41 | 122.90 |
| 98 | NaN | 496906.75 | 20 | ATM_INT | 6 | 16 | M | Personal Plus | 40 | 24000000 | 10500000 | 1 | 3 | 453.23 | 4552.41 | 1812.93 |
| 99 | NaN | 192825.50 | 20 | POS | 5 | 15 | M | Preferencial | 36 | 47736000 | 3000000 | 2 | 9 | 113.45 | 4552.41 | 5218.81 |
100 rows × 16 columns
# One-hot encode the categorical variables in the test set
data_encoded = pd.get_dummies(data_test1, columns=['Canal1', 'SEXO', 'SEGMENTO'])
data_encoded.isnull().sum()
FRAUDE                    100
VALOR                       0
HORA_AUX                    0
DIASEM                      0
DIAMES                      0
EDAD                        0
INGRESOS                    0
EGRESOS                     0
NROPAISES                   0
NROCIUDADES                 0
Dist_Mean_NAL              21
Dist_HOY                    0
Dist_sum_NAL                0
Canal1_ATM_INT              0
Canal1_POS                  0
SEXO_F                      0
SEXO_M                      0
SEGMENTO_Emprendedor        0
SEGMENTO_PYME               0
SEGMENTO_Personal           0
SEGMENTO_Personal Plus      0
SEGMENTO_Preferencial       0
dtype: int64
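# Note: pd.get_dummies on the test set produces no SEGMENTO_Empresarial column because
# that segment never occurs in the 100 test rows. The manual feature list below avoids
# the problem, but a more general sketch (an optional suggestion, not the original code)
# is to reindex the test dummies against the training dummy columns:
train_dummy_cols = pd.get_dummies(data_train, columns=['Canal1', 'SEXO', 'SEGMENTO']).columns
data_encoded = data_encoded.reindex(columns=train_dummy_cols, fill_value=0)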
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
# Keep the same 10 features selected on the training data, in the same order
data_encoded1 = data_encoded[['VALOR', 'HORA_AUX', 'DIAMES', 'EDAD', 'INGRESOS', 'EGRESOS',
                              'Dist_Mean_NAL', 'Dist_sum_NAL', 'Canal1_ATM_INT', 'Canal1_POS']]
X_ = data_encoded1
from sklearn.impute import KNNImputer
# Impute the remaining missing values with a KNN imputer (a SimpleImputer would also work)
imputeKNN = KNNImputer(n_neighbors=2)
X_imputed = imputeKNN.fit_transform(X_)
X_imputed.shape
(100, 10)
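# Here the KNN imputer is fitted on the 100 test rows themselves. A sketch of the more
# conventional approach (an optional alternative, not the original code) fits the imputer
# on the same 10 training columns and only transforms the test set:
feature_cols = ['VALOR', 'HORA_AUX', 'DIAMES', 'EDAD', 'INGRESOS', 'EGRESOS',
                'Dist_Mean_NAL', 'Dist_sum_NAL', 'Canal1_ATM_INT', 'Canal1_POS']
imputer_knn = KNNImputer(n_neighbors=2)
imputer_knn.fit(X[feature_cols])       # X: training features built earlier in the notebook
X_imputed = imputer_knn.transform(X_)  # X_: the 10 test columns selected above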
# Generate predictions on the held-out test file with the baseline model
y_pred = model.predict(X_imputed)
# Predictions
y_pred
array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
data_test
| | id | FRAUDE | VALOR | HORA_AUX | Dist_max_COL | Dist_max_INTER | Canal1 | FECHA_FRAUDE | COD_PAIS | CANAL | ... | Dist_Mean_INTER | Dist_Max_INTER | NROCIUDADES | Dist_Sum_NAL | Dist_Mean_NAL | Dist_HOY | Dist_sum_NAL | Dist_mean_NAL | Dist_sum_INTER | Dist_mean_INTER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 98523068 | NaN | 42230.09 | 18 | 1.00 | 1.00 | POS | 20150515 | US | POS | ... | NaN | NaN | 1 | NaN | NaN | 4552.41 | 1.00 | 1.00 | 1.00 | 1.00 |
| 1 | 300237898 | NaN | 143202.65 | 20 | 614.04 | 7632.97 | POS | 20150506 | US | MCI | ... | 6092.69 | 7632.97 | 2 | 1228.07 | 614.04 | 4552.41 | 1228.07 | 614.04 | 24370.75 | 6092.69 |
| 2 | 943273308 | NaN | 243591.25 | 2 | 286.84 | 2443.14 | ATM_INT | 20150517 | EC | ATM_INT | ... | 1743.52 | 2443.14 | 7 | 1944.35 | 138.88 | 5083.41 | 1944.35 | 138.88 | 6974.09 | 1743.52 |
| 3 | 951645809 | NaN | 238267.40 | 20 | 1.00 | 1.00 | ATM_INT | 20150508 | EC | ATM_INT | ... | NaN | NaN | 1 | NaN | NaN | 904.81 | 1.00 | 1.00 | 1.00 | 1.00 |
| 4 | 963797516 | NaN | 490403.58 | 13 | 1.00 | 1.00 | ATM_INT | 20150501 | US | ATM_INT | ... | NaN | NaN | 1 | NaN | NaN | 4552.41 | 1.00 | 1.00 | 1.00 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | 9970518152 | NaN | 531534.03 | 13 | 340.09 | 1.00 | POS | 20150501 | US | POS | ... | NaN | NaN | 4 | 844.35 | 56.29 | 4552.41 | 844.35 | 56.29 | 1.00 | 1.00 |
| 96 | 9971748725 | NaN | 52035.08 | 11 | 28.59 | 4552.41 | POS | 20150503 | AW | POS | ... | 4552.41 | 4552.41 | 3 | 151.52 | 25.25 | 971.23 | 151.52 | 25.25 | 9104.82 | 4552.41 |
| 97 | 9979565282 | NaN | 18309.04 | 23 | 61.45 | 1.00 | POS | 20150515 | US | POS | ... | NaN | NaN | 2 | 122.90 | 61.45 | 4552.41 | 122.90 | 61.45 | 1.00 | 1.00 |
| 98 | 9979718478 | NaN | 496906.75 | 20 | 733.11 | 1.00 | ATM_INT | 20150516 | US | ATM_INT | ... | NaN | NaN | 3 | 1812.93 | 453.23 | 4552.41 | 1812.93 | 453.23 | 1.00 | 1.00 |
| 99 | 9998668320 | NaN | 192825.50 | 20 | 337.29 | 904.81 | POS | 20150515 | US | MCI | ... | 904.81 | 904.81 | 9 | 5218.81 | 113.45 | 4552.41 | 5218.81 | 113.45 | 1809.62 | 904.81 |
100 rows × 32 columns
# Attach the predictions to the test DataFrame
data_test.loc[:, "FRAUDE"] = list(y_pred)
/tmp/ipykernel_147672/3603763774.py:1: FutureWarning:
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  data_test.loc[:, "FRAUDE"] = list(y_pred)
data_test
| | id | FRAUDE | VALOR | HORA_AUX | Dist_max_COL | Dist_max_INTER | Canal1 | FECHA_FRAUDE | COD_PAIS | CANAL | ... | Dist_Mean_INTER | Dist_Max_INTER | NROCIUDADES | Dist_Sum_NAL | Dist_Mean_NAL | Dist_HOY | Dist_sum_NAL | Dist_mean_NAL | Dist_sum_INTER | Dist_mean_INTER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 98523068 | 0 | 42230.09 | 18 | 1.00 | 1.00 | POS | 20150515 | US | POS | ... | NaN | NaN | 1 | NaN | NaN | 4552.41 | 1.00 | 1.00 | 1.00 | 1.00 |
| 1 | 300237898 | 0 | 143202.65 | 20 | 614.04 | 7632.97 | POS | 20150506 | US | MCI | ... | 6092.69 | 7632.97 | 2 | 1228.07 | 614.04 | 4552.41 | 1228.07 | 614.04 | 24370.75 | 6092.69 |
| 2 | 943273308 | 1 | 243591.25 | 2 | 286.84 | 2443.14 | ATM_INT | 20150517 | EC | ATM_INT | ... | 1743.52 | 2443.14 | 7 | 1944.35 | 138.88 | 5083.41 | 1944.35 | 138.88 | 6974.09 | 1743.52 |
| 3 | 951645809 | 1 | 238267.40 | 20 | 1.00 | 1.00 | ATM_INT | 20150508 | EC | ATM_INT | ... | NaN | NaN | 1 | NaN | NaN | 904.81 | 1.00 | 1.00 | 1.00 | 1.00 |
| 4 | 963797516 | 1 | 490403.58 | 13 | 1.00 | 1.00 | ATM_INT | 20150501 | US | ATM_INT | ... | NaN | NaN | 1 | NaN | NaN | 4552.41 | 1.00 | 1.00 | 1.00 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | 9970518152 | 0 | 531534.03 | 13 | 340.09 | 1.00 | POS | 20150501 | US | POS | ... | NaN | NaN | 4 | 844.35 | 56.29 | 4552.41 | 844.35 | 56.29 | 1.00 | 1.00 |
| 96 | 9971748725 | 0 | 52035.08 | 11 | 28.59 | 4552.41 | POS | 20150503 | AW | POS | ... | 4552.41 | 4552.41 | 3 | 151.52 | 25.25 | 971.23 | 151.52 | 25.25 | 9104.82 | 4552.41 |
| 97 | 9979565282 | 0 | 18309.04 | 23 | 61.45 | 1.00 | POS | 20150515 | US | POS | ... | NaN | NaN | 2 | 122.90 | 61.45 | 4552.41 | 122.90 | 61.45 | 1.00 | 1.00 |
| 98 | 9979718478 | 0 | 496906.75 | 20 | 733.11 | 1.00 | ATM_INT | 20150516 | US | ATM_INT | ... | NaN | NaN | 3 | 1812.93 | 453.23 | 4552.41 | 1812.93 | 453.23 | 1.00 | 1.00 |
| 99 | 9998668320 | 0 | 192825.50 | 20 | 337.29 | 904.81 | POS | 20150515 | US | MCI | ... | 904.81 | 904.81 | 9 | 5218.81 | 113.45 | 4552.41 | 5218.81 | 113.45 | 1809.62 | 904.81 |
100 rows × 32 columns
data_test.to_csv("test_evaluado.csv")
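# If only the id and the predicted FRAUDE flag are needed, a small sketch (the file name
# here is illustrative) writes just those two columns without the DataFrame index:
data_test[['id', 'FRAUDE']].to_csv("fraud_predictions.csv", index=False)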
selected_indices
array([ 0, 1, 3, 4, 5, 6, 9, 11, 12, 13])
import numpy as np
# Indices of the features kept by SelectFromModel
selected_indices = np.where(feature_selector.get_support())[0]
# Names of the selected features
selected_features = X.columns[selected_indices]
print("Selected features:")
print(selected_features)
selected_features.shape
Selected features:
Index(['VALOR', 'HORA_AUX', 'DIAMES', 'EDAD', 'INGRESOS', 'EGRESOS',
'Dist_Mean_NAL', 'Dist_sum_NAL', 'Canal1_ATM_INT', 'Canal1_POS'],
dtype='object')
(10,)
import matplotlib.pyplot as plt
# Feature importances from the random forest used for selection
feature_importances = lf.feature_importances_
# DataFrame to visualise the feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
# Sort the features by importance
feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)
# Horizontal bar chart of the feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
plt.xlabel('Importance (mean decrease in impurity)')
plt.ylabel('Feature')
plt.title('Random Forest Feature Importances')
plt.gca().invert_yaxis()  # most important feature at the top
plt.show()