import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
# Load dataset
data = pd.read_csv("Customer.csv")
# Define features and target
X = data[['Total Purchases', 'Last Purchase (Days Ago)', 'Support Tickets', 'Avg Order Value']]
y = data['Subscription Status']  # 1 = active subscription, 0 = canceled (churned)
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize features (important for logistic regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]  # Probability of class 1 (staying active), used for ROC-AUC
# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
# Print performance metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"AUC-ROC Score: {roc_auc:.2f}")
# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
# Print detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))
# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.plot([0, 1], [0, 1], 'k--') # Diagonal line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()
Accuracy: 0.98
Precision: 1.00
Recall: 0.97
F1 Score: 0.98
AUC-ROC Score: 0.99
Confusion Matrix:
 [[46  0]
  [ 3 98]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        46
           1       1.00      0.97      0.98       101

    accuracy                           0.98       147
   macro avg       0.97      0.99      0.98       147
weighted avg       0.98      0.98      0.98       147
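These near-perfect numbers come from a single 80/20 split, so a quick cross-validation pass is a reasonable sanity check that they are not a split artifact. A minimal sketch, assuming the `X` and `y` defined above; the `Pipeline` re-fits the scaler inside each fold so no held-out information leaks into the scaling step.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Scale inside each fold so the held-out fold never influences the scaler
pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])
cv_scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print(f"5-fold CV accuracy: {cv_scores.mean():.2f} +/- {cv_scores.std():.2f}")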
# Example new customer input (ensure the order matches training features)
new_customer = pd.DataFrame([[10, 7, 9, 300]], columns=X.columns)
new_customer2 = pd.DataFrame([[2, 3, 15, 100]], columns=X.columns)
# Standardize the new customer data
new_customer_scaled = scaler.transform(new_customer)
new_customer_scaled2 = scaler.transform(new_customer2)
# Predict churn probability (class 0 = canceled subscription)
churn_probability = model.predict_proba(new_customer_scaled)[:, 0]
print(f"Churn Probability: {churn_probability[0]:.2f}")
churn_probability2 = model.predict_proba(new_customer_scaled2)[:, 0]
print(f"Churn Probability2: {churn_probability2[0]:.2f}")
# Predict the label (0 = Churn, 1 = Stay)
churn_prediction = model.predict(new_customer_scaled)
print(f"Churn Prediction: {'Churn' if churn_prediction[0] == 0 else 'Stay'}")
churn_prediction2 = model.predict(new_customer_scaled2)
print(f"Churn Prediction2: {'Churn' if churn_prediction2[0] == 0 else 'Stay'}")
Churn Probability: 0.16
Churn Probability2: 0.84
Churn Prediction: Stay
Churn Prediction2: Churn
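Each one-off prediction repeats the same scale-then-predict steps, so a small helper keeps them consistent. A minimal sketch; `predict_churn` is a hypothetical convenience wrapper around the fitted `scaler` and `model` above (run it before they are overwritten later in the notebook).
def predict_churn(total_purchases, days_since_purchase, tickets, avg_order_value):
    """Hypothetical helper: return (churn probability, label) for one customer."""
    row = pd.DataFrame([[total_purchases, days_since_purchase, tickets, avg_order_value]],
                       columns=X.columns)
    scaled = scaler.transform(row)
    prob_churn = model.predict_proba(scaled)[0, 0]  # class 0 = churn
    label = "Churn" if model.predict(scaled)[0] == 0 else "Stay"
    return prob_churn, label
print(predict_churn(10, 7, 9, 300))  # should match the first example above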
# Generate a new DataFrame of 100 customers with random data
np.random.seed(42) # For reproducibility
new_customers_df = pd.DataFrame({
"Total Purchases": np.random.randint(1, 65, 100), # Between 1 and 50 purchases
"Last Purchase (Days Ago)": np.random.randint(1, 365, 100), # Last purchase up to a year ago
"Support Tickets": np.random.randint(0, 11, 100), # 0 to 10 support tickets
"Avg Order Value": np.random.uniform(10, 500, 100) # Avg order value between $10 and $500
})
new_customers_df.head(5)
|   | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value |
|---|---|---|---|---|
| 0 | 39 | 191 | 10 | 491.262488 |
| 1 | 52 | 218 | 9 | 263.151587 |
| 2 | 29 | 44 | 6 | 137.806296 |
| 3 | 15 | 162 | 8 | 498.164313 |
| 4 | 43 | 202 | 6 | 483.055482 |
# Ensure we only use the original features
new_customers_features = new_customers_df[X.columns] # Select only training features
# Standardize the new customer dataset
new_customers_scaled = scaler.transform(new_customers_features)
# Predict churn probabilities (using [:, 0] since 0 = Churn)
new_customers_df["Churn Probability"] = model.predict_proba(new_customers_scaled)[:, 0]
# Predict churn labels (1 = Stay, 0 = Churn)
new_customers_df["Churn Prediction"] = model.predict(new_customers_scaled)
new_customers_df["Churn Prediction"] = new_customers_df["Churn Prediction"].map({1: "Stay", 0: "Churn"})
new_customers_df.head(5)
|   | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Churn Probability | Churn Prediction |
|---|---|---|---|---|---|---|
| 0 | 39 | 191 | 10 | 491.262488 | 0.227241 | Stay |
| 1 | 52 | 218 | 9 | 263.151587 | 0.010269 | Stay |
| 2 | 29 | 44 | 6 | 137.806296 | 0.001463 | Stay |
| 3 | 15 | 162 | 8 | 498.164313 | 0.741230 | Churn |
| 4 | 43 | 202 | 6 | 483.055482 | 0.021054 | Stay |
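Before plotting, a quick tally shows how the 100 simulated customers split between the two predicted labels; a minimal sketch using the column just added.
# Count predicted labels among the simulated customers
print(new_customers_df["Churn Prediction"].value_counts())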
import matplotlib.pyplot as plt
# Define risk threshold
risk_threshold = 0.5
# Separate high-risk and low-risk customers
high_risk = new_customers_df[new_customers_df["Churn Probability"] > risk_threshold]
low_risk = new_customers_df[new_customers_df["Churn Probability"] <= risk_threshold]
# Plot scatter plot with color differentiation
plt.figure(figsize=(10, 5))
plt.scatter(low_risk.index, low_risk["Churn Probability"], color="blue", alpha=0.7, label="Low Risk")
plt.scatter(high_risk.index, high_risk["Churn Probability"], color="red", alpha=0.7, label="High Risk (Churn Likely)")
plt.axhline(y=risk_threshold, color="black", linestyle="--", label=f"Risk Threshold ({risk_threshold})")
# Labels and title
plt.xlabel("Customer Index")
plt.ylabel("Churn Probability")
plt.title("Churn Probability of Customers (High-Risk Highlighted)")
plt.ylim(0, 1) # Churn probabilities range from 0 to 1
plt.legend()
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 5))
plt.hist(new_customers_df["Churn Probability"], bins=50, edgecolor="black", alpha=0.7)
plt.xlabel("Churn Probability")
plt.ylabel("Number of Customers")
plt.title("Distribution of Predicted Churn Risk")
plt.show()
plt.figure(figsize=(6, 5))
plt.boxplot(new_customers_df["Churn Probability"], vert=True)
plt.ylabel("Churn Probability")
plt.title("Churn Probability Distribution")
plt.show()
# Select top 10 high-risk customers
high_risk_customers = new_customers_df.sort_values(by="Churn Probability", ascending=False).head(10)
# Plot
plt.figure(figsize=(10, 5))
plt.barh(high_risk_customers.index, high_risk_customers["Churn Probability"], color='red', alpha=0.7)
plt.xlabel("Churn Probability")
plt.ylabel("Customer Index")
plt.title("Top 10 High-Risk Customers")
plt.gca().invert_yaxis() # Flip to show highest risk at the top
plt.show()
high_risk_customers.head(10)
|    | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Churn Probability | Churn Prediction |
|---|---|---|---|---|---|---|
| 30 | 21 | 344 | 9 | 97.623128 | 0.879452 | Churn |
| 27 | 2 | 345 | 3 | 96.203374 | 0.864798 | Churn |
| 58 | 3 | 131 | 7 | 477.005205 | 0.850195 | Churn |
| 16 | 36 | 338 | 10 | 394.816919 | 0.831530 | Churn |
| 91 | 4 | 170 | 10 | 44.761049 | 0.831191 | Churn |
| 13 | 11 | 213 | 6 | 445.007348 | 0.791150 | Churn |
| 62 | 21 | 255 | 7 | 490.362476 | 0.788516 | Churn |
| 63 | 9 | 359 | 4 | 46.919665 | 0.780079 | Churn |
| 75 | 2 | 96 | 10 | 185.736923 | 0.775888 | Churn |
| 3 | 15 | 162 | 8 | 498.164313 | 0.741230 | Churn |
high_risk_customers.describe()
|       | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Churn Probability |
|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 12.400000 | 241.300000 | 7.400000 | 277.660040 | 0.813403 |
| std | 11.057426 | 100.080024 | 2.503331 | 199.055141 | 0.044527 |
| min | 2.000000 | 96.000000 | 3.000000 | 44.761049 | 0.741230 |
| 25% | 3.250000 | 164.000000 | 6.250000 | 96.558312 | 0.782188 |
| 50% | 10.000000 | 234.000000 | 7.500000 | 290.276921 | 0.811170 |
| 75% | 19.500000 | 342.500000 | 9.750000 | 469.005741 | 0.845529 |
| max | 36.000000 | 359.000000 | 10.000000 | 498.164313 | 0.879452 |
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
# Selecting two key features for visualization: "Total Purchases" and "Support Tickets"
X_selected = data[["Total Purchases", "Support Tickets"]]
y = data["Subscription Status"]
# Standardizing the selected features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Training logistic regression model on the two selected features
# (note: this overwrites the four-feature `model` and `scaler` fitted earlier)
model = LogisticRegression()
model.fit(X_train, y_train)
# Generate mesh grid for decision boundary visualization
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
# Predict the probability of staying active (class 1) for each mesh-grid point
Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
# Plot decision boundary
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, levels=20, cmap="coolwarm", alpha=0.7)
plt.colorbar(label="Probability of Staying Active")
# Scatter plot of actual data points
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, edgecolors="k", cmap="bwr", alpha=0.8)
# Labels and title
plt.xlabel("Total Purchases (Standardized)")
plt.ylabel("Support Tickets (Standardized)")
plt.title("Logistic Regression Decision Boundary - Churn Prediction")
plt.show()
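The next cell hard-codes the coefficient values; they can be read straight off a fitted estimator via `model.coef_`. Because the boundary plot above overwrote `model` and `scaler` with two-feature versions, this sketch re-fits the original four-feature model first (same split and `random_state`, so it should reproduce the values below).
# Re-fit the four-feature model to recover its coefficients
X4 = data[['Total Purchases', 'Last Purchase (Days Ago)', 'Support Tickets', 'Avg Order Value']]
y4 = data['Subscription Status']
X4_train, _, y4_train, _ = train_test_split(X4, y4, test_size=0.2, random_state=42)
scaler4 = StandardScaler()
model4 = LogisticRegression()
model4.fit(scaler4.fit_transform(X4_train), y4_train)
print(model4.coef_[0])      # source of the hard-coded coefficients below
print(model4.intercept_[0])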
# Feature names
features = ["Total Purchases", "Last Purchase (Days Ago)", "Support Tickets", "Avg Order Value"]
# Coefficients from the original four-feature model (model.coef_[0]);
# positive values push toward class 1 (staying active)
coefficients = [0.56425893, -0.04388535, -0.91195298, -0.00283536]
# Assign colors based on coefficient sign
colors = ["green" if coef > 0 else "red" for coef in coefficients]
# Create bar chart
plt.figure(figsize=(8, 5))
plt.bar(features, coefficients, color=colors)
plt.axhline(0, color="black", linewidth=1, linestyle="--") # Reference line at 0
plt.xlabel("Features")
plt.ylabel("Coefficient Value")
plt.title("Logistic Regression Feature Importance (Churn Prediction)")
plt.xticks(rotation=30, ha="right")
# Display the graph
plt.show()
# Count values
pi = data['Subscription Status'].value_counts()
# Plot pie chart
plt.pie(pi, labels=['Active','Non-Active'], autopct='%1.1f%%', pctdistance=0.6,
labeldistance=1.1, startangle=90)
plt.title('Active vs Canceled Subscriptions')
# Show the plot
plt.show()
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Unnamed: 0                735 non-null    int64
 1   Customer ID               735 non-null    int64
 2   Total Purchases           735 non-null    int64
 3   Last Purchase (Days Ago)  735 non-null    int64
 4   Support Tickets           735 non-null    int64
 5   Avg Order Value           735 non-null    float64
 6   Subscription Status       735 non-null    int64
dtypes: float64(1), int64(6)
memory usage: 40.3 KB
active = data[data['Subscription Status'] == 1]
active.describe()
|       | Unnamed: 0 | Customer ID | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Subscription Status |
|---|---|---|---|---|---|---|---|
| count | 490.000000 | 490.000000 | 490.000000 | 490.000000 | 490.000000 | 490.000000 | 490.0 |
| mean | 249.502041 | 1250.502041 | 50.440816 | 175.240816 | 4.381633 | 110.864224 | 1.0 |
| std | 144.294054 | 144.294054 | 28.949600 | 104.151686 | 2.894758 | 51.976864 | 0.0 |
| min | 0.000000 | 1001.000000 | 1.000000 | 1.000000 | 0.000000 | 20.190000 | 1.0 |
| 25% | 125.250000 | 1126.250000 | 25.000000 | 85.250000 | 2.000000 | 65.452500 | 1.0 |
| 50% | 250.500000 | 1251.500000 | 51.500000 | 178.000000 | 4.000000 | 114.050000 | 1.0 |
| 75% | 373.750000 | 1374.750000 | 73.750000 | 254.750000 | 7.000000 | 155.292500 | 1.0 |
| max | 499.000000 | 1500.000000 | 99.000000 | 364.000000 | 9.000000 | 199.340000 | 1.0 |
nonactive = data[data['Subscription Status'] == 0]
nonactive.describe()
|       | Unnamed: 0 | Customer ID | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Subscription Status |
|---|---|---|---|---|---|---|---|
| count | 245.000000 | 245.000000 | 245.000000 | 245.000000 | 245.000000 | 245.000000 | 245.0 |
| mean | 251.767347 | 1252.767347 | 4.236735 | 284.216327 | 6.710204 | 144.443143 | 0.0 |
| std | 155.324195 | 155.324195 | 2.512485 | 44.648423 | 1.406111 | 47.201468 | 0.0 |
| min | 45.000000 | 1046.000000 | 1.000000 | 217.000000 | 5.000000 | 64.560000 | 0.0 |
| 25% | 72.000000 | 1073.000000 | 2.000000 | 228.000000 | 5.000000 | 115.870000 | 0.0 |
| 50% | 228.000000 | 1229.000000 | 3.000000 | 277.000000 | 7.000000 | 168.410000 | 0.0 |
| 75% | 399.000000 | 1400.000000 | 6.000000 | 317.000000 | 8.000000 | 179.070000 | 0.0 |
| max | 481.000000 | 1482.000000 | 8.000000 | 350.000000 | 9.000000 | 193.180000 | 0.0 |
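The two `describe()` calls above can be condensed into one side-by-side view; a minimal sketch grouping on the label column.
# Mean of each feature for canceled (0) vs. active (1) customers in a single table
feature_cols = ['Total Purchases', 'Last Purchase (Days Ago)', 'Support Tickets', 'Avg Order Value']
print(data.groupby('Subscription Status')[feature_cols].mean())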
# Build a new DataFrame with churn prediction outputs for a dashboard and Airtable export
import pandas as pd
# Add a low/medium/high risk segment column
new_customers_df["Risk Segment"] = pd.cut(
new_customers_df["Churn Probability"],
bins=[0, 0.3, 0.6, 1],
labels=["Low Risk", "Medium Risk", "High Risk"]
)
new_customers_df.head(3)
|   | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Churn Probability | Churn Prediction | Risk Segment |
|---|---|---|---|---|---|---|---|
| 0 | 39 | 191 | 10 | 491.262488 | 0.227241 | Stay | Low Risk |
| 1 | 52 | 218 | 9 | 263.151587 | 0.010269 | Stay | Low Risk |
| 2 | 29 | 44 | 6 | 137.806296 | 0.001463 | Stay | Low Risk |
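A quick count shows how the simulated customers distribute across the three buckets; a minimal sketch (the categorical dtype from `pd.cut` keeps all three labels even if one is empty).
# Customers per risk bucket
print(new_customers_df["Risk Segment"].value_counts().sort_index())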
# Compute average values for each risk level (observed=False keeps all three categories)
risk_averages = new_customers_df.groupby("Risk Segment", observed=False)[
    ["Total Purchases", "Last Purchase (Days Ago)", "Support Tickets", "Avg Order Value"]
].mean()
# Plot the comparison (pandas .plot creates its own figure, so no separate plt.figure is needed)
risk_averages.T.plot(kind="bar", figsize=(10, 6))
plt.xlabel("Customer Attributes")
plt.ylabel("Average Value")
plt.title("Comparison of Low, Medium, and High-Risk Customers")
plt.xticks(rotation=45)
plt.legend(title="Risk Level")
plt.show()
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Use the Elbow Method to determine the optimal number of clusters
inertia = []
cluster_range = range(1, 10)  # Test 1 to 9 clusters
for k in cluster_range:
    kmeans_test = KMeans(n_clusters=k, n_init=10, random_state=42)  # pin n_init; its default changed in newer scikit-learn
    kmeans_test.fit(new_customers_scaled)
    inertia.append(kmeans_test.inertia_)  # inertia = within-cluster sum of squared distances
# Plot the Elbow Method graph
plt.figure(figsize=(8, 5))
plt.plot(cluster_range, inertia, marker="o")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal Clusters")
plt.show()
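The elbow is often ambiguous, so the silhouette score makes a useful complementary check. A minimal sketch over the same candidate range (silhouette requires at least two clusters); higher is better.
from sklearn.metrics import silhouette_score
for k in range(2, 10):
    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(new_customers_scaled)
    print(f"k={k}: silhouette = {silhouette_score(new_customers_scaled, labels):.3f}")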
# Choose an optimal cluster count (e.g., 3 if the elbow method suggests it)
optimal_k = 3 # Change this based on your elbow method results
# Train K-Means with the optimal cluster count
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
new_customers_df["Customer Segment"] = kmeans.fit_predict(new_customers_scaled)
from sklearn.decomposition import PCA
# Reduce to 2D for visualization
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(new_customers_scaled)
# Plot the clusters
plt.figure(figsize=(8, 5))
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=new_customers_df["Customer Segment"], cmap="viridis", alpha=0.7)
plt.xlabel("PCA Component 1 (spending behavior)")
plt.ylabel("PCA Component 2 (customer service dependency)")
plt.title("Customer Segments (K-Means Clustering)")
plt.colorbar(label="Cluster ID")
plt.show()
# Print how much each original feature contributes to each PCA component
pca_components = pd.DataFrame(pca.components_, columns=X.columns, index=[f"PC{i+1}" for i in range(pca.n_components_)])
print(pca_components)
     Total Purchases  Last Purchase (Days Ago)  Support Tickets  Avg Order Value
PC1         0.041337                 -0.029370         0.065743         0.996547
PC2        -0.002778                  0.155137         0.986043        -0.060363
**PC1 (First Principal Component).** Avg Order Value (0.996547) contributes almost 100% of PC1; Support Tickets (0.065743) and Total Purchases (0.041337) have minimal impact, and Last Purchase (Days Ago) (-0.029370) has a slight negative effect.

**What PC1 represents:** "Spending Behavior". Customers with a high Avg Order Value have high PC1 values; PC1 is barely affected by purchase counts or customer support interactions.

**PC2 (Second Principal Component).** Support Tickets (0.986043) is the dominant factor in PC2; Last Purchase (Days Ago) (0.155137) makes a small positive contribution, while Total Purchases (-0.002778) and Avg Order Value (-0.060363) have almost no effect.

**What PC2 represents:** "Customer Service Dependency". Higher PC2 values indicate customers with more support tickets; Avg Order Value barely matters for this component.

**What this means for the clusters.** A customer with a high PC1 value is a big spender (high Avg Order Value); a customer with a high PC2 value contacts support a lot; a customer high on both is a big spender who also needs frequent support. On the PCA scatter plot, right-side customers (high PC1) are high-spending customers, top-side customers (high PC2) are customers with many support tickets, and the clusters may separate "VIP buyers" from "frequent complainers."
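How faithful is this 2-D picture? The explained variance ratio quantifies how much of the scaled feature variance the two components retain; a minimal sketch using the `pca` object fitted above.
# Fraction of total variance captured by each principal component
print(pca.explained_variance_ratio_)
print(f"Variance retained in 2D: {pca.explained_variance_ratio_.sum():.1%}")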