Churn Forecasting Series: Large-Scale End-to-End Churn Modeling


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
from scipy.io import arff

# Disable warnings
warnings.filterwarnings('ignore')

data, meta = arff.loadarff('large_churn_data')
data = pd.DataFrame(data)
data.iloc[:, :8]

        year  month  user_account_id  user_lifetime  user_intake  user_no_outgoing_activity_in_days  user_account_balance_last  user_spendings
0     2013.0    6.0             13.0         1000.0          0.0                                1.0                       0.05            0.00
1     2013.0    6.0             14.0         1000.0          0.0                               25.0                      28.31            3.45
2     2013.0    6.0             18.0         1005.0          0.0                                8.0                      15.62            1.97
3     2013.0    6.0             27.0         1013.0          0.0                               11.0                       5.62            0.00
4     2013.0    6.0             32.0         1032.0          0.0                                2.0                       5.86            0.15
...      ...    ...              ...            ...          ...                                ...                        ...             ...
66464 2013.0    8.0        1196937.0        15947.0          0.0                                1.0                       0.00            0.00
66465 2013.0    8.0        1196938.0        15947.0          0.0                                1.0                       0.00            0.00
66466 2013.0    8.0        1196939.0        15947.0          0.0                                1.0                      15.00            0.00
66467 2013.0    8.0        1196940.0        15947.0          0.0                                1.0                       0.00            0.00
66468 2013.0    8.0        1196944.0            1.0          1.0                                1.0                      15.00            0.00

66469 rows × 8 columns


data["churn"].value_counts()
churn
0.0    52562
1.0    13907
Name: count, dtype: int64

len(data.index)
66469

data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66469 entries, 0 to 66468
Data columns (total 66 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   year                                        66469 non-null  float64
 1   month                                       66469 non-null  float64
 2   user_account_id                             66469 non-null  float64
 3   user_lifetime                               66469 non-null  float64
 4   user_intake                                 66469 non-null  float64
 5   user_no_outgoing_activity_in_days           66469 non-null  float64
 6   user_account_balance_last                   66469 non-null  float64
 7   user_spendings                              66469 non-null  float64
 8   user_has_outgoing_calls                     66469 non-null  float64
 9   user_has_outgoing_sms                       66469 non-null  float64
 10  user_use_gprs                               66469 non-null  float64
 11  user_does_reload                            66469 non-null  float64
 12  reloads_inactive_days                       66469 non-null  float64
 13  reloads_count                               66469 non-null  float64
 14  reloads_sum                                 66469 non-null  float64
 15  calls_outgoing_count                        66469 non-null  float64
 16  calls_outgoing_spendings                    66469 non-null  float64
 17  calls_outgoing_duration                     66469 non-null  float64
 18  calls_outgoing_spendings_max                66469 non-null  float64
 19  calls_outgoing_duration_max                 66469 non-null  float64
 20  calls_outgoing_inactive_days                66469 non-null  float64
 21  calls_outgoing_to_onnet_count               66469 non-null  float64
 22  calls_outgoing_to_onnet_spendings           66469 non-null  float64
 23  calls_outgoing_to_onnet_duration            66469 non-null  float64
 24  calls_outgoing_to_onnet_inactive_days       66469 non-null  float64
 25  calls_outgoing_to_offnet_count              66469 non-null  float64
 26  calls_outgoing_to_offnet_spendings          66469 non-null  float64
 27  calls_outgoing_to_offnet_duration           66469 non-null  float64
 28  calls_outgoing_to_offnet_inactive_days      66469 non-null  float64
 29  calls_outgoing_to_abroad_count              66469 non-null  float64
 30  calls_outgoing_to_abroad_spendings          66469 non-null  float64
 31  calls_outgoing_to_abroad_duration           66469 non-null  float64
 32  calls_outgoing_to_abroad_inactive_days      66469 non-null  float64
 33  sms_outgoing_count                          66469 non-null  float64
 34  sms_outgoing_spendings                      66469 non-null  float64
 35  sms_outgoing_spendings_max                  66469 non-null  float64
 36  sms_outgoing_inactive_days                  66469 non-null  float64
 37  sms_outgoing_to_onnet_count                 66469 non-null  float64
 38  sms_outgoing_to_onnet_spendings             66469 non-null  float64
 39  sms_outgoing_to_onnet_inactive_days         66469 non-null  float64
 40  sms_outgoing_to_offnet_count                66469 non-null  float64
 41  sms_outgoing_to_offnet_spendings            66469 non-null  float64
 42  sms_outgoing_to_offnet_inactive_days        66469 non-null  float64
 43  sms_outgoing_to_abroad_count                66469 non-null  float64
 44  sms_outgoing_to_abroad_spendings            66469 non-null  float64
 45  sms_outgoing_to_abroad_inactive_days        66469 non-null  float64
 46  sms_incoming_count                          66469 non-null  float64
 47  sms_incoming_spendings                      66469 non-null  float64
 48  sms_incoming_from_abroad_count              66469 non-null  float64
 49  sms_incoming_from_abroad_spendings          66469 non-null  float64
 50  gprs_session_count                          66469 non-null  float64
 51  gprs_usage                                  66469 non-null  float64
 52  gprs_spendings                              66469 non-null  float64
 53  gprs_inactive_days                          66469 non-null  float64
 54  last_100_reloads_count                      66469 non-null  float64
 55  last_100_reloads_sum                        66469 non-null  float64
 56  last_100_calls_outgoing_duration            66469 non-null  float64
 57  last_100_calls_outgoing_to_onnet_duration   66469 non-null  float64
 58  last_100_calls_outgoing_to_offnet_duration  66469 non-null  float64
 59  last_100_calls_outgoing_to_abroad_duration  66469 non-null  float64
 60  last_100_sms_outgoing_count                 66469 non-null  float64
 61  last_100_sms_outgoing_to_onnet_count        66469 non-null  float64
 62  last_100_sms_outgoing_to_offnet_count       66469 non-null  float64
 63  last_100_sms_outgoing_to_abroad_count       66469 non-null  float64
 64  last_100_gprs_usage                         66469 non-null  float64
 65  churn                                       66469 non-null  float64
dtypes: float64(66)
memory usage: 33.5 MB

data.isnull().sum()
year                                     0
month                                    0
user_account_id                          0
user_lifetime                            0
user_intake                              0
                                        ..
last_100_sms_outgoing_to_onnet_count     0
last_100_sms_outgoing_to_offnet_count    0
last_100_sms_outgoing_to_abroad_count    0
last_100_gprs_usage                      0
churn                                    0
Length: 66, dtype: int64

data["churn"].value_counts()
churn
0.0    52562
1.0    13907
Name: count, dtype: int64

data.dtypes
year                                     float64
month                                    float64
user_account_id                          float64
user_lifetime                            float64
user_intake                              float64
                                          ...   
last_100_sms_outgoing_to_onnet_count     float64
last_100_sms_outgoing_to_offnet_count    float64
last_100_sms_outgoing_to_abroad_count    float64
last_100_gprs_usage                      float64
churn                                    float64
Length: 66, dtype: object

data.columns
Index(['year', 'month', 'user_account_id', 'user_lifetime', 'user_intake',
       'user_no_outgoing_activity_in_days', 'user_account_balance_last',
       'user_spendings', 'user_has_outgoing_calls', 'user_has_outgoing_sms',
       'user_use_gprs', 'user_does_reload', 'reloads_inactive_days',
       'reloads_count', 'reloads_sum', 'calls_outgoing_count',
       'calls_outgoing_spendings', 'calls_outgoing_duration',
       'calls_outgoing_spendings_max', 'calls_outgoing_duration_max',
       'calls_outgoing_inactive_days', 'calls_outgoing_to_onnet_count',
       'calls_outgoing_to_onnet_spendings', 'calls_outgoing_to_onnet_duration',
       'calls_outgoing_to_onnet_inactive_days',
       'calls_outgoing_to_offnet_count', 'calls_outgoing_to_offnet_spendings',
       'calls_outgoing_to_offnet_duration',
       'calls_outgoing_to_offnet_inactive_days',
       'calls_outgoing_to_abroad_count', 'calls_outgoing_to_abroad_spendings',
       'calls_outgoing_to_abroad_duration',
       'calls_outgoing_to_abroad_inactive_days', 'sms_outgoing_count',
       'sms_outgoing_spendings', 'sms_outgoing_spendings_max',
       'sms_outgoing_inactive_days', 'sms_outgoing_to_onnet_count',
       'sms_outgoing_to_onnet_spendings',
       'sms_outgoing_to_onnet_inactive_days', 'sms_outgoing_to_offnet_count',
       'sms_outgoing_to_offnet_spendings',
       'sms_outgoing_to_offnet_inactive_days', 'sms_outgoing_to_abroad_count',
       'sms_outgoing_to_abroad_spendings',
       'sms_outgoing_to_abroad_inactive_days', 'sms_incoming_count',
       'sms_incoming_spendings', 'sms_incoming_from_abroad_count',
       'sms_incoming_from_abroad_spendings', 'gprs_session_count',
       'gprs_usage', 'gprs_spendings', 'gprs_inactive_days',
       'last_100_reloads_count', 'last_100_reloads_sum',
       'last_100_calls_outgoing_duration',
       'last_100_calls_outgoing_to_onnet_duration',
       'last_100_calls_outgoing_to_offnet_duration',
       'last_100_calls_outgoing_to_abroad_duration',
       'last_100_sms_outgoing_count', 'last_100_sms_outgoing_to_onnet_count',
       'last_100_sms_outgoing_to_offnet_count',
       'last_100_sms_outgoing_to_abroad_count', 'last_100_gprs_usage',
       'churn'],
      dtype='object')

data.drop("user_account_id", inplace=True,axis=1)

data["churn"] = data["churn"].astype(int)
data["churn_string"] = data["churn"].map({1:"churn",0:"active"})

EDA:


columns_to_visualize = data.columns.tolist()
columns_to_visualize.remove("churn")
columns_to_visualize.remove("churn_string")


# Calculate the number of rows and columns for the subplot grid
num_columns = len(columns_to_visualize)
num_rows = (num_columns + 4) // 5  # Adjust 5 to change columns per row

plt.figure(figsize=(20, num_rows * 3))  # Adjust the figure size as needed

for i, column in enumerate(columns_to_visualize):
    plt.subplot(num_rows, 5, i+1)  # Dynamically set rows and columns
    sns.histplot(data, x=column, hue="churn_string", element="poly")
    plt.title(column)
    plt.tight_layout()

plt.show()

png


zeros_count = {col: (data[col] == 0).sum() for col in data.select_dtypes(include=['float64'])}
zeros_df = pd.DataFrame(list(zeros_count.items()), columns=['Column', 'Number of Zeros'])
plt.figure(figsize=(7, 13))  # Adjust the figure size as needed

sns.barplot(y='Column', x='Number of Zeros', data=zeros_df)
plt.title('Zeros Count')
plt.show()

png


counts = []
for col in data.select_dtypes(include=['float64']):
    counts.append({'Column': col, 'Count': (data[col] == 0).sum(), 'Type': 'Zeros'})
    counts.append({'Column': col, 'Count': (data[col] != 0).sum(), 'Type': 'Non-Zeros'})

plt.figure(figsize=(7, 13))  
count_df = pd.DataFrame(counts)
sns.barplot(y='Column', x='Count', hue='Type', data=count_df)
plt.title('Zeros and Non-Zeros Columns')
plt.show()

png


churn_counts = data['churn'].value_counts()
plt.pie(churn_counts, labels=churn_counts.index, autopct='%1.1f%%', startangle=140, colors=['skyblue', 'salmon'])
plt.title('Churn Distribution')
plt.show()

png


corr = data.drop("churn_string",axis=1).corr()

# Creating a heatmap
plt.figure(figsize=(15, 15))
sns.heatmap(corr, annot=False,fmt='.2f',cmap="rocket")
plt.title('Correlation Heatmap')
plt.show()

png


spearman_corr = data.drop("churn_string",axis=1).corr(method='spearman')

plt.figure(figsize=(15, 15))
sns.heatmap(spearman_corr, annot=False,cmap="rocket",fmt='.2f')
plt.title('Spearman Correlation Heatmap')
plt.show()

png


data["churn_string"] = data["churn"].map({0: "active", 1: "churn"})

numerical_columns = data.columns.tolist()
numerical_columns.remove("churn")
numerical_columns.remove("churn_string")

num_columns = len(numerical_columns)
num_rows = (num_columns + 5) // 6  # 6 plots per row


fig, axs = plt.subplots(num_rows, 6, figsize=(20, 4*num_rows))  # Adjust figsize as needed


axs = axs.flatten()


for i, column in enumerate(numerical_columns):
    data.pivot(columns='churn_string', values=column).plot.hist(bins=40, ax=axs[i], title=column)
    axs[i].set_title(column)  # Set the title for each subplot


for i in range(len(numerical_columns), len(axs)):
    axs[i].set_visible(False)

plt.tight_layout()
plt.show()

data.drop("churn_string", axis=1, inplace=True)

png

Modeling:

Split the data:

data["churn"].value_counts()
churn
0    52562
1    13907
Name: count, dtype: int64
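
About one row in five is a churner, so the classes are imbalanced but not extremely so. A quick sketch (nothing below depends on it) of the churn rate and the negative-to-positive ratio that is the usual starting value for XGBoost's scale_pos_weight:

class_counts = data["churn"].value_counts()
churn_rate = class_counts.loc[1] / class_counts.sum()       # share of churners
neg_pos_ratio = class_counts.loc[0] / class_counts.loc[1]   # common scale_pos_weight starting point
print(f"churn rate: {churn_rate:.1%}, negative/positive ratio: {neg_pos_ratio:.2f}")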

from sklearn import  model_selection

X_train, X_test, y_train, y_test= model_selection.train_test_split(
    data.drop(columns=['churn'],axis=1), data.churn, stratify=data.churn, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape
((49851, 64), (16618, 64), (49851,), (16618,))

y_train.value_counts()
churn
0    39421
1    10430
Name: count, dtype: int64

y_test.value_counts()
churn
0    13141
1     3477
Name: count, dtype: int64
Initial Models:

import xgboost as xgb
xg = xgb.XGBClassifier()
xg.fit(X_train, y_train)
xg.score(X_test, y_test)
0.8762185581899146

from sklearn import linear_model, preprocessing
std = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression(penalty=None)
lr.fit(std.fit_transform(X_train), y_train)
lr.score(std.transform(X_test), y_test)
0.8656878084005295

from sklearn import dummy

dum = dummy.DummyClassifier()
dum.fit(X_train, y_train)
dum.score(X_test, y_test)
0.7907690456131905
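
The dummy baseline simply predicts the majority class, so its 0.79 accuracy is the floor any real model has to beat. A quick check, assuming the y_test from the split above:

# the default DummyClassifier strategy ('prior') always predicts the majority class,
# so its accuracy equals the share of non-churners in the test set
print((y_test == 0).mean())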

import xgboost as xgb
xg = xgb.XGBClassifier(early_stopping_rounds=30)
xg.fit(X_train, y_train, eval_set=[(X_train, y_train),
                                   (X_test, y_test)],
       verbose=10)
xg.score(X_test, y_test)
[0]	validation_0-logloss:0.40680	validation_1-logloss:0.40818
[10]	validation_0-logloss:0.27387	validation_1-logloss:0.29215
[20]	validation_0-logloss:0.26279	validation_1-logloss:0.29272
[30]	validation_0-logloss:0.25650	validation_1-logloss:0.29388
[40]	validation_0-logloss:0.24975	validation_1-logloss:0.29504
[42]	validation_0-logloss:0.24930	validation_1-logloss:0.29527





0.876940666746901

xg.best_iteration
12
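
With early stopping, the fitted booster still stores every tree that was grown, but score and predict default to the best iteration found on the validation set. A small sanity-check sketch making that explicit:

# passing iteration_range explicitly should reproduce the default behaviour
preds_default = xg.predict(X_test)
preds_best = xg.predict(X_test, iteration_range=(0, xg.best_iteration + 1))
print((preds_default == preds_best).all())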

results = xg.evals_result()

ax = (pd.DataFrame({'training': results['validation_0']['logloss'],
                    'testing': results['validation_1']['logloss']})
      .assign(ntrees=lambda adf: range(1, len(adf)+1))
      .set_index('ntrees')
      .plot(figsize=(5,4),
            title='eval_results with early_stopping')
      )
ax.annotate('Best number \nof trees (12)', xy=(20, .18),
            xytext=(34,.32), arrowprops={'color':'k'})
ax.set_xlabel('ntrees')
Text(0.5, 0, 'ntrees')

png


from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

cv = StratifiedKFold(n_splits=50)

model = xgb.XGBClassifier()
visualizer = LearningCurve(
    xgb.XGBClassifier(), cv=cv, scoring='f1', n_jobs=4
)

visualizer.fit(X_train, y_train)         
visualizer.show()

png

<Axes: title={'center': 'Learning Curve for XGBClassifier'}, xlabel='Training Instances', ylabel='Score'>

Evaluation


from sklearn import metrics
cm = metrics.confusion_matrix(y_test, xg.predict(X_test))
cm
array([[12251,   890],
       [ 1155,  2322]])

disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                                      display_labels=['Active', 'Churn'])
disp.plot(cmap='Blues')
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2a2476210>

png


# a colormap over raw counts may be misleading, so normalize by the true class
cm = metrics.confusion_matrix(y_test, xg.predict(X_test),
                              normalize='true')
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                                      display_labels=['Active', 'Churn'])
disp.plot(cmap='Blues')
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2a242d7d0>

png


from sklearn import metrics

print(metrics.classification_report(y_test, xg.predict(X_test),
                                    target_names=['Active', 'Churn']))
              precision    recall  f1-score   support

      Active       0.91      0.93      0.92     13141
       Churn       0.72      0.67      0.69      3477

    accuracy                           0.88     16618
   macro avg       0.82      0.80      0.81     16618
weighted avg       0.87      0.88      0.88     16618


# note: these predictions use unscaled features, which is why this report is far worse
# than the 0.866 score above; use lr.predict(std.transform(X_test)) to evaluate the
# model on the same scaled inputs it was trained on
print(metrics.classification_report(y_test, lr.predict(X_test),
                                    target_names=['N', 'Y']))
              precision    recall  f1-score   support

           N       0.95      0.47      0.63     13141
           Y       0.31      0.90      0.46      3477

    accuracy                           0.56     16618
   macro avg       0.63      0.69      0.55     16618
weighted avg       0.81      0.56      0.60     16618


from yellowbrick import classifier
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8,4))
classifier.precision_recall_curve(xg, X_train, y_train,
                                  X_test, y_test, ax=ax, per_class=True)
ax.set_ylim((0,1.05))

png

(0.0, 1.05)

from yellowbrick.classifier import DiscriminationThreshold
 
model = xgb.XGBClassifier()
visualizer = DiscriminationThreshold(model)

visualizer.fit(X_train, y_train)
visualizer.show()

png

<Axes: title={'center': 'Threshold Plot for XGBClassifier'}, xlabel='discrimination threshold', ylabel='score'>

from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, roc_auc_score, accuracy_score


def metric_data(clf, X, y, metrics=['accuracy'], cv=4):
    res = pd.DataFrame(model_selection.cross_validate(clf, X, y, cv=cv, scoring=metrics,
                                                      return_train_score=True, ))
    return res


# sample_size can be reduced for quicker experiments; here the full dataset is used
sample_size = len(data)
small_data = data.sample(sample_size).drop(columns=['churn'])
small_y = data.loc[small_data.index].churn == 1

# note: with micro averaging on a binary target, f1/precision/recall all reduce to
# accuracy, which is why those cross-validation columns below are identical
scorers = {
    'f1_score': make_scorer(f1_score, average='micro'),
    'precision_score': make_scorer(precision_score, average='micro'),
    'recall_score': make_scorer(recall_score, average='micro'),
    'accuracy_score': make_scorer(accuracy_score),
    'roc_auc_score': make_scorer(roc_auc_score),
}

xg_data = metric_data(xgb.XGBClassifier(), small_data, small_y, metrics=scorers)
xg_data.iloc[:, :9]

   fit_time  score_time  test_f1_score  train_f1_score  test_precision_score  train_precision_score  test_recall_score  train_recall_score  test_accuracy_score
0  0.493094    0.021333       0.877181        0.897194              0.877181               0.897194           0.877181            0.897194             0.877181
1  0.559788    0.021369       0.878257        0.897115              0.878257               0.897115           0.878257            0.897115             0.878257
2  0.539547    0.022056       0.871036        0.899984              0.871036               0.899984           0.871036            0.899984             0.871036
3  0.576529    0.021437       0.877535        0.899503              0.877535               0.899503           0.877535            0.899503             0.877535


(xg_data
 .assign(type='xg')
 .groupby('type')
 .mean()
 .plot.bar()
 .legend(bbox_to_anchor=(1,1))
 )
<matplotlib.legend.Legend at 0x2a26a7150>

png

Tuning

Hyperparameters are tuned with hyperopt in sequential rounds: tree shape first, then the imbalance-related parameters, stochastic sampling, regularization, and finally the learning rate, with the best values from each round carried into the next.


from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import accuracy_score, roc_auc_score

from typing import Any, Dict, Union

def hyperparameter_tuning(space: Dict[str, Union[float, int]],
                          X_train: pd.DataFrame, y_train: pd.Series,
                          X_test: pd.DataFrame, y_test: pd.Series,
                          early_stopping_rounds: int=50,
                          metric:callable=accuracy_score) -> Dict[str, Any]:

    int_vals = ['max_depth', 'reg_alpha']
    space = {k: (int(val) if k in int_vals else val)
             for k,val in space.items()}
    space['early_stopping_rounds'] = early_stopping_rounds
    model = xgb.XGBClassifier(**space)
    evaluation = [(X_train, y_train),
                  (X_test, y_test)]
    model.fit(X_train, y_train,
              eval_set=evaluation,
              verbose=False)

    pred = model.predict(X_test)
    score = metric(y_test, pred)
    return {'loss': -score, 'status': STATUS_OK, 'model': model}

params = {'random_state': 42, 'eval_metric': 'aucpr'}

rounds = [{'max_depth': hp.quniform('max_depth', 1, 8, 1),  # tree
           'min_child_weight': hp.loguniform('min_child_weight', -2, 3)},
          {'scale_pos_weight':hp.uniform('scale_pos_weight', 0, 10), # imbalanced
           'max_delta_step':hp.uniform('max_delta_step', 0, 10)},
          {'subsample': hp.uniform('subsample', 0.5, 1),   # stochastic
           'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)},
          {'reg_alpha': hp.uniform('reg_alpha', 0, 10),
           'reg_lambda': hp.uniform('reg_lambda', 1, 10),},
          {'gamma': hp.loguniform('gamma', -10, 10)}, # regularization
          {'learning_rate': hp.loguniform('learning_rate', -7, 0)} # boosting
          ]

all_trials = []
for round in rounds:
    params = {**params, **round}
    trials = Trials()
    best = fmin(fn=lambda space: hyperparameter_tuning(space, X_train,
                                                       y_train, X_test, y_test),
                space=params,
                algo=tpe.suggest,
                max_evals=20,
                trials=trials,
                )
    params = {**params, **best}
    all_trials.append(trials)
100%|██████████| 20/20 [00:17<00:00,  1.14trial/s, best loss: -0.8778433024431339]
100%|██████████| 20/20 [00:17<00:00,  1.13trial/s, best loss: -0.876940666746901] 
100%|██████████| 20/20 [00:17<00:00,  1.14trial/s, best loss: -0.8773618967384763]
100%|██████████| 20/20 [00:22<00:00,  1.13s/trial, best loss: -0.8776627753038874]
100%|██████████| 20/20 [00:20<00:00,  1.03s/trial, best loss: -0.8776627753038874]
100%|██████████| 20/20 [00:24<00:00,  1.21s/trial, best loss: -0.8772415453123119]

params
{'random_state': 42,
 'eval_metric': 'aucpr',
 'max_depth': 6.0,
 'min_child_weight': 0.23193058123356144,
 'scale_pos_weight': 0.9019571775366886,
 'max_delta_step': 4.521025902082406,
 'subsample': 0.6927185598814062,
 'colsample_bytree': 0.9760587389780198,
 'reg_alpha': 9.42070439253806,
 'reg_lambda': 7.767016493057269,
 'gamma': 0.0026822919233543095,
 'learning_rate': 0.1369703361489519}
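
Each round's Trials object keeps the full search history, so the progression can be inspected after the fact. A rough sketch, assuming the all_trials list built in the loop above:

# best (most negative) loss per tuning round; negating it recovers the
# test-set accuracy used as the objective
for i, trials in enumerate(all_trials, start=1):
    print(f"round {i}: best accuracy = {-min(trials.losses()):.4f}")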

step_params = {'random_state': 42,
 'eval_metric': 'aucpr',
 'max_depth': 6,
 'min_child_weight': 0.23193058123356144,
 'scale_pos_weight': 0.9019571775366886,
 'max_delta_step': 4.521025902082406,
 'subsample': 0.6927185598814062,
 'colsample_bytree': 0.9760587389780198,
 'reg_alpha': 9.42070439253806,
 'reg_lambda': 7.767016493057269,
 'gamma': 0.0026822919233543095,
 'learning_rate': 0.1369703361489519}


import xgboost as xgb
xg_step = xgb.XGBClassifier(**step_params, early_stopping_rounds=100, n_estimators=250)
xg_step.fit(X_train, y_train, eval_set=[(X_train, y_train),
                                        (X_test, y_test)],
            verbose=100)
xg_step.score(X_test, y_test)
[0]	validation_0-aucpr:0.72961	validation_1-aucpr:0.71990
[100]	validation_0-aucpr:0.78069	validation_1-aucpr:0.76154
[200]	validation_0-aucpr:0.79322	validation_1-aucpr:0.76115
[230]	validation_0-aucpr:0.79634	validation_1-aucpr:0.76068





0.8768203153207366

xg_step.best_iteration
131

import xgboost as xgb
xg = xgb.XGBClassifier()
xg.fit(X_train, y_train)
xg.score(X_test, y_test)
0.8762185581899146

import matplotlib.pyplot as plt
from sklearn import metrics

fig, axes = plt.subplots(figsize=(8,4), ncols=2)
metrics.RocCurveDisplay.from_estimator(xg,
X_train, y_train,ax=axes[0], label='default train')
metrics.RocCurveDisplay.from_estimator(xg,
                                       X_test, y_test,ax=axes[0])#, label='default test')
axes[0].set(title='ROC plots for default model')

metrics.RocCurveDisplay.from_estimator(xg_step,
                                       X_train, y_train,ax=axes[1], label='step train')
metrics.RocCurveDisplay.from_estimator(xg_step,
                                       X_test, y_test,ax=axes[1])#, label='step test')
axes[1].set(title='ROC plots for stepwise model')
[Text(0.5, 1.0, 'ROC plots for stepwise model')]

png


from yellowbrick import classifier

fig, axes = plt.subplots(figsize=(8,4), ncols=2)
classifier.precision_recall_curve(xg, X_train, y_train,
                                  X_test, y_test, micro=False, macro=False, ax=axes[0], per_class=True,
                                  show=False)
axes[0].set_ylim((0,1.05))
axes[0].set(title='PR plots for default model')

classifier.precision_recall_curve(xg_step, X_train, y_train,
                                  X_test, y_test, micro=False, macro=False, ax=axes[1], per_class=True,
                                  show=False)
axes[1].set_ylim((0,1.05))
axes[1].set(title='PR plots for stepwise model')
[Text(0.5, 1.0, 'PR plots for stepwise model')]

png


from sklearn import metrics

cm = metrics.confusion_matrix(y_test, xg_step.predict(X_test),
                              normalize='true')
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                                      display_labels=['no', 'yes'])
disp.plot(cmap='Blues')
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2e01d6e10>

png

Interpreting


feature_importances = pd.Series(xg_step.feature_importances_, index=X_train.columns).sort_values()
plt.figure(figsize=(8, 12)) 
feature_importances.plot.barh()
plt.show()

png


xgb.plot_tree(xg_step, num_trees=0, rankdir='LR')
<Axes: >

png


import dtreeviz

viz = dtreeviz.model(xg_step, X_train=X_train,
                     y_train=y_train,
                     target_name='churn',
                     feature_names=list(X_train.columns), class_names=['Active', 'Churn'],
                     tree_index=0)
viz.view()

svg


import xgboost as xgb
xg_step = xgb.XGBClassifier(**step_params, early_stopping_rounds=20)
xg_step.fit(X_train, y_train, eval_set=[(X_train, y_train),
                                        (X_test, y_test)],
            verbose=100)
xg_step.score(X_test, y_test)
[0]	validation_0-aucpr:0.72961	validation_1-aucpr:0.71990
[91]	validation_0-aucpr:0.77963	validation_1-aucpr:0.76151





0.8762185581899146

import shap
shap.initjs()

shap_ex = shap.TreeExplainer(xg_step)
vals = shap_ex(X_test)

vals_df = pd.DataFrame(vals.values, columns=X_test.columns, index=X_test.index)

shap_ex.expected_value
-1.5311589

# rows whose raw margin (sum of SHAP values plus the expected value) is positive,
# i.e. rows the model predicts as churn
(vals_df.sum(axis='columns') + shap_ex.expected_value).pipe(lambda ser: ser[ser > 0])
60710    0.450789
64399    1.400459
62002    1.400459
62986    1.400459
25853    0.352904
           ...   
64617    0.785654
59634    0.494769
47023    1.122428
59216    0.078687
63871    1.400459
Length: 3072, dtype: float32
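
These SHAP values live in log-odds (margin) space: summing a row's values and adding expected_value reconstructs the booster's raw margin, and the sigmoid of that margin is the predicted churn probability. A cross-check sketch (up to float32 rounding):

from scipy.special import expit
import numpy as np
import xgboost as xgb

margins = vals_df.sum(axis='columns') + shap_ex.expected_value
raw_margins = xg_step.get_booster().predict(xgb.DMatrix(X_test), output_margin=True)
print(np.allclose(margins, raw_margins, atol=1e-2))   # SHAP additivity over all trees
print(expit(margins).head())                          # implied churn probabilities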

vals_df.loc[62986]
year                                     0.000000
month                                    0.120240
user_lifetime                            0.011200
user_intake                             -0.006299
user_no_outgoing_activity_in_days        0.106907
                                           ...   
last_100_sms_outgoing_count              0.047387
last_100_sms_outgoing_to_onnet_count     0.004574
last_100_sms_outgoing_to_offnet_count    0.035136
last_100_sms_outgoing_to_abroad_count   -0.001823
last_100_gprs_usage                     -0.000812
Name: 62986, Length: 64, dtype: float32

shap.plots.waterfall(vals[26])

png


shap.plots.waterfall(vals[28])

png


shap.plots.waterfall(vals[29])

png


shap.plots.beeswarm(vals, max_display=100)

png



feature_importances
year                                      0.000000
sms_incoming_from_abroad_spendings        0.000000
sms_outgoing_to_abroad_inactive_days      0.000000
sms_outgoing_to_offnet_inactive_days      0.000000
calls_outgoing_to_offnet_inactive_days    0.000000
                                            ...   
user_intake                               0.026111
calls_outgoing_duration                   0.030865
calls_outgoing_inactive_days              0.038314
last_100_calls_outgoing_duration          0.136882
user_spendings                            0.316150
Length: 64, dtype: float32

Constraints

The partial dependence plots below show how the predicted churn probability moves with a few influential features; where the relationship is roughly monotonic, it can be enforced explicitly with XGBoost's monotone_constraints (+1 for non-decreasing, -1 for non-increasing).


 
shap.plots.partial_dependence(ind='user_spendings',
                              model=lambda rows: xg_step.predict_proba(rows)[:,-1],
                              data=X_train)

png


 
shap.plots.partial_dependence(ind='calls_outgoing_to_abroad_inactive_days',
                              model=lambda rows: xg_step.predict_proba(rows)[:,-1],
                              data=X_train)

png


 
shap.plots.partial_dependence(ind='last_100_calls_outgoing_duration',
                              model=lambda rows: xg_step.predict_proba(rows)[:,-1],
                              data=X_train)

png

 
shap.plots.partial_dependence(ind='calls_outgoing_inactive_days',
                              model=lambda rows: xg_step.predict_proba(rows)[:,-1],
                              data=X_train)

png


shap.plots.partial_dependence(ind='user_intake',
                              model=lambda rows: xg_step.predict_proba(rows)[:,-1],
                              data=X_train)

png



import xgboost as xgb

xg_const = xgb.XGBClassifier(**step_params, early_stopping_rounds=20,
                             monotone_constraints={'calls_outgoing_to_abroad_inactive_days': 1,
                                                   'last_100_calls_outgoing_duration': -1,
                                                   'calls_outgoing_inactive_days': -1, 'user_intake': -1,
                                                   "user_spendings": -1})
xg_const.fit(X_train, y_train, eval_set=[(X_train, y_train),
                                         (X_test, y_test)],
             verbose=100)
xg_const.score(X_test, y_test)
[0]	validation_0-aucpr:0.72500	validation_1-aucpr:0.71750
[99]	validation_0-aucpr:0.77979	validation_1-aucpr:0.76136





0.8765796124684078

xg_const = xgb.XGBClassifier(**step_params, early_stopping_rounds=20,
                             n_estimators=200,
                             monotone_constraints={'calls_outgoing_to_abroad_inactive_days': 1,
                                                   'last_100_calls_outgoing_duration': -1,
                                                   'calls_outgoing_inactive_days': -1, 'user_intake': -1,
                                                   "user_spendings": -1})
xg_const.fit(X_train, y_train, eval_set=[(X_train, y_train),
                                         (X_test, y_test)],
             verbose=100)
xg_const.score(X_test, y_test)
[0]	validation_0-aucpr:0.72500	validation_1-aucpr:0.71750
[100]	validation_0-aucpr:0.77987	validation_1-aucpr:0.76135
[111]	validation_0-aucpr:0.78168	validation_1-aucpr:0.76126





0.8765796124684078
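
A rough way to see a constraint in action: hold one test row fixed, sweep user_spendings across its observed range, and check that the constrained model's churn probability never increases, as the -1 constraint requires. A sketch, not an exhaustive verification:

import numpy as np

row = X_test.iloc[[0]]
grid = np.linspace(X_test['user_spendings'].min(), X_test['user_spendings'].max(), 50)
probs = []
for value in grid:
    modified = row.copy()
    modified['user_spendings'] = value
    probs.append(xg_const.predict_proba(modified)[0, 1])
# non-increasing sequence (allowing tiny numerical noise)
print(all(later <= earlier + 1e-9 for earlier, later in zip(probs, probs[1:])))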


from sklearn import metrics
# note: hard 0/1 predictions give AUC at a single threshold; passing
# xg_const.predict_proba(X_test)[:, 1] would give the usual ranking-based AUC
metrics.roc_auc_score(y_test, xg_const.predict(X_test))
0.7917795192196179

xg_step.score(X_test, y_test)
0.8762185581899146

(pd.Series(xg_step.feature_importances_, index=X_train.columns)
 .sort_values()
 .iloc[-10:]
 .plot.barh()
 )
<Axes: >

png


(pd.Series(xg_const.feature_importances_, index=X_train.columns)
 .sort_values()
 .iloc[-10:]
 .plot.barh()
 )
<Axes: >

png


import xgboost as xgb
xg = xgb.XGBClassifier()
xg.fit(X_train, y_train)
xg.score(X_test, y_test)
0.8762185581899146

(pd.Series(xg.feature_importances_, index=X_train.columns)
 .sort_values()
 .iloc[-10:]
 .plot.barh()
 )
<Axes: >

png

Deployment

For deployment the constrained model is refit on the full dataset and logged with MLflow, so it can be reloaded through the pyfunc flavor and served elsewhere.


Ys = data["churn"]
Xs = data.drop("churn",axis=1)

step_params = {'random_state': 42,
               'eval_metric': 'aucpr',
               'max_depth': 7,
               'min_child_weight': 0.4180116655186252,
               'scale_pos_weight': 0.9905218835879426,
               'max_delta_step': 4.116066988118725,
               'subsample': 0.9368886996127623,
               'colsample_bytree': 0.647742541625645,
               'reg_alpha': 2.620031077025826,
               'reg_lambda': 3.2023850771623024,
               'gamma': 0.0010860710789745316,
               'learning_rate': 0.2684672626409679}


xg_const = xgb.XGBClassifier(**step_params, 
                             monotone_constraints={'calls_outgoing_to_abroad_inactive_days': 1,
                                                   'last_100_calls_outgoing_duration': -1,
                                                   'calls_outgoing_inactive_days': -1,
                                                   'user_intake': -1,
                                                   "user_spendings": -1})
xg_const.fit(Xs, Ys,verbose=100)

import mlflow
model_info = mlflow.xgboost.log_model(xg_const, artifact_path='artifact')

model_info.run_id
'e7de66e56ade4eb9b29eadaad731941b'

logged_model = model_info.model_uri
loaded_model = mlflow.pyfunc.load_model(logged_model)

loaded_model.predict(X_test)
array([0, 0, 0, ..., 0, 0, 0])
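
Once loaded through the pyfunc flavor, the model can be wrapped in a small scoring helper for downstream jobs. A sketch, assuming incoming rows carry the same feature columns used in training; the helper name is illustrative:

def score_customers(frame: pd.DataFrame) -> pd.DataFrame:
    """Attach the logged model's churn predictions to a feature frame."""
    preds = loaded_model.predict(frame)
    return frame.assign(churn_pred=preds,
                        churn_label=pd.Series(preds, index=frame.index).map({0: 'active', 1: 'churn'}))

score_customers(X_test)['churn_label'].value_counts()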