import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
# Load dataset
data = pd.read_csv("Customer.csv")
# Define features and target
X = data[['Total Purchases', 'Last Purchase (Days Ago)', 'Support Tickets', 'Avg Order Value']]
y = data['Subscription Status']  # 1 = active subscription, 0 = canceled (churned)
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize features (important for logistic regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]  # Probability of class 1 (staying active), used for ROC-AUC
# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
# Print performance metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"AUC-ROC Score: {roc_auc:.2f}")
# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
# Print detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))
# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.plot([0, 1], [0, 1], 'k--') # Diagonal line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()
Accuracy: 0.98
Precision: 1.00
Recall: 0.97
F1 Score: 0.98
AUC-ROC Score: 0.99
Confusion Matrix:
 [[46  0]
  [ 3 98]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        46
           1       1.00      0.97      0.98       101

    accuracy                           0.98       147
   macro avg       0.97      0.99      0.98       147
weighted avg       0.98      0.98      0.98       147
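These near-perfect numbers come from a single 80/20 split, so a quick cross-validation pass is a reasonable sanity check that they are not a split artifact. A minimal sketch, assuming the `X` and `y` defined above; the `Pipeline` re-fits the scaler inside each fold so no held-out information leaks into the scaling step.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Scale inside each fold so the held-out fold never influences the scaler
pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])
cv_scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print(f"5-fold CV accuracy: {cv_scores.mean():.2f} +/- {cv_scores.std():.2f}")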
# Example new customer input (ensure the order matches training features)
new_customer = pd.DataFrame([[10, 7, 9, 300]], columns=X.columns)
new_customer2 = pd.DataFrame([[2, 3, 15, 100]], columns=X.columns)
# Standardize the new customer data
new_customer_scaled = scaler.transform(new_customer)
new_customer_scaled2 = scaler.transform(new_customer2)
# Predict churn probability (class 0 = canceled subscription)
churn_probability = model.predict_proba(new_customer_scaled)[:, 0]
print(f"Churn Probability: {churn_probability[0]:.2f}")
churn_probability2 = model.predict_proba(new_customer_scaled2)[:, 0]
print(f"Churn Probability2: {churn_probability2[0]:.2f}")
# Predict the label (0 = Churn, 1 = Stay)
churn_prediction = model.predict(new_customer_scaled)
print(f"Churn Prediction: {'Churn' if churn_prediction[0] == 0 else 'Stay'}")
churn_prediction2 = model.predict(new_customer_scaled2)
print(f"Churn Prediction2: {'Churn' if churn_prediction2[0] == 0 else 'Stay'}")
Churn Probability: 0.16
Churn Probability2: 0.84
Churn Prediction: Stay
Churn Prediction2: Churn
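Each one-off prediction repeats the same scale-then-predict steps, so a small helper keeps them consistent. A minimal sketch; `predict_churn` is a hypothetical convenience wrapper around the fitted `scaler` and `model` above (run it before they are overwritten later in the notebook).
def predict_churn(total_purchases, days_since_purchase, tickets, avg_order_value):
    """Hypothetical helper: return (churn probability, label) for one customer."""
    row = pd.DataFrame([[total_purchases, days_since_purchase, tickets, avg_order_value]],
                       columns=X.columns)
    scaled = scaler.transform(row)
    prob_churn = model.predict_proba(scaled)[0, 0]  # class 0 = churn
    label = "Churn" if model.predict(scaled)[0] == 0 else "Stay"
    return prob_churn, label
print(predict_churn(10, 7, 9, 300))  # should match the first example above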
# Generate a new DataFrame of 100 customers with random data
np.random.seed(42) # For reproducibility
new_customers_df = pd.DataFrame({
"Total Purchases": np.random.randint(1, 65, 100), # Between 1 and 50 purchases
"Last Purchase (Days Ago)": np.random.randint(1, 365, 100), # Last purchase up to a year ago
"Support Tickets": np.random.randint(0, 11, 100), # 0 to 10 support tickets
"Avg Order Value": np.random.uniform(10, 500, 100) # Avg order value between $10 and $500
})
new_customers_df.head(5)
|   | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value |
|---|---|---|---|---|
| 0 | 39 | 191 | 10 | 491.262488 |
| 1 | 52 | 218 | 9 | 263.151587 |
| 2 | 29 | 44 | 6 | 137.806296 |
| 3 | 15 | 162 | 8 | 498.164313 |
| 4 | 43 | 202 | 6 | 483.055482 |
# Ensure we only use the original features
new_customers_features = new_customers_df[X.columns] # Select only training features
# Standardize the new customer dataset
new_customers_scaled = scaler.transform(new_customers_features)
# Predict churn probabilities (using [:, 0] since 0 = Churn)
new_customers_df["Churn Probability"] = model.predict_proba(new_customers_scaled)[:, 0]
# Predict churn labels (1 = Stay, 0 = Churn)
new_customers_df["Churn Prediction"] = model.predict(new_customers_scaled)
new_customers_df["Churn Prediction"] = new_customers_df["Churn Prediction"].map({1: "Stay", 0: "Churn"})
new_customers_df.head(5)
|   | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Churn Probability | Churn Prediction |
|---|---|---|---|---|---|---|
| 0 | 39 | 191 | 10 | 491.262488 | 0.227241 | Stay |
| 1 | 52 | 218 | 9 | 263.151587 | 0.010269 | Stay |
| 2 | 29 | 44 | 6 | 137.806296 | 0.001463 | Stay |
| 3 | 15 | 162 | 8 | 498.164313 | 0.741230 | Churn |
| 4 | 43 | 202 | 6 | 483.055482 | 0.021054 | Stay |
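Before plotting, a quick tally shows how the 100 simulated customers split between the two predicted labels; a minimal sketch using the column just added.
# Count predicted labels among the simulated customers
print(new_customers_df["Churn Prediction"].value_counts())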
import matplotlib.pyplot as plt
# Define risk threshold
risk_threshold = 0.5
# Separate high-risk and low-risk customers
high_risk = new_customers_df[new_customers_df["Churn Probability"] > risk_threshold]
low_risk = new_customers_df[new_customers_df["Churn Probability"] <= risk_threshold]
# Plot scatter plot with color differentiation
plt.figure(figsize=(10, 5))
plt.scatter(low_risk.index, low_risk["Churn Probability"], color="blue", alpha=0.7, label="Low Risk")
plt.scatter(high_risk.index, high_risk["Churn Probability"], color="red", alpha=0.7, label="High Risk (Churn Likely)")
plt.axhline(y=risk_threshold, color="black", linestyle="--", label=f"Risk Threshold ({risk_threshold})")
# Labels and title
plt.xlabel("Customer Index")
plt.ylabel("Churn Probability")
plt.title("Churn Probability of Customers (High-Risk Highlighted)")
plt.ylim(0, 1) # Churn probabilities range from 0 to 1
plt.legend()
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 5))
plt.hist(new_customers_df["Churn Probability"], bins=50, edgecolor="black", alpha=0.7)
plt.xlabel("Churn Probability")
plt.ylabel("Number of Customers")
plt.title("Distribution of Predicted Churn Risk")
plt.show()
plt.figure(figsize=(6, 5))
plt.boxplot(new_customers_df["Churn Probability"], vert=True)
plt.ylabel("Churn Probability")
plt.title("Churn Probability Distribution")
plt.show()
# Select top 10 high-risk customers
high_risk_customers = new_customers_df.sort_values(by="Churn Probability", ascending=False).head(10)
# Plot
plt.figure(figsize=(10, 5))
plt.barh(high_risk_customers.index, high_risk_customers["Churn Probability"], color='red', alpha=0.7)
plt.xlabel("Churn Probability")
plt.ylabel("Customer Index")
plt.title("Top 10 High-Risk Customers")
plt.gca().invert_yaxis() # Flip to show highest risk at the top
plt.show()
high_risk_customers.head(10)
|    | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Churn Probability | Churn Prediction |
|---|---|---|---|---|---|---|
| 30 | 21 | 344 | 9 | 97.623128 | 0.879452 | Churn |
| 27 | 2 | 345 | 3 | 96.203374 | 0.864798 | Churn |
| 58 | 3 | 131 | 7 | 477.005205 | 0.850195 | Churn |
| 16 | 36 | 338 | 10 | 394.816919 | 0.831530 | Churn |
| 91 | 4 | 170 | 10 | 44.761049 | 0.831191 | Churn |
| 13 | 11 | 213 | 6 | 445.007348 | 0.791150 | Churn |
| 62 | 21 | 255 | 7 | 490.362476 | 0.788516 | Churn |
| 63 | 9 | 359 | 4 | 46.919665 | 0.780079 | Churn |
| 75 | 2 | 96 | 10 | 185.736923 | 0.775888 | Churn |
| 3 | 15 | 162 | 8 | 498.164313 | 0.741230 | Churn |
high_risk_customers.describe()
|       | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Churn Probability |
|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 12.400000 | 241.300000 | 7.400000 | 277.660040 | 0.813403 |
| std | 11.057426 | 100.080024 | 2.503331 | 199.055141 | 0.044527 |
| min | 2.000000 | 96.000000 | 3.000000 | 44.761049 | 0.741230 |
| 25% | 3.250000 | 164.000000 | 6.250000 | 96.558312 | 0.782188 |
| 50% | 10.000000 | 234.000000 | 7.500000 | 290.276921 | 0.811170 |
| 75% | 19.500000 | 342.500000 | 9.750000 | 469.005741 | 0.845529 |
| max | 36.000000 | 359.000000 | 10.000000 | 498.164313 | 0.879452 |
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
# Selecting two key features for visualization: "Total Purchases" and "Support Tickets"
X_selected = data[["Total Purchases", "Support Tickets"]]
y = data["Subscription Status"]
# Standardizing the selected features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Training logistic regression model on the two selected features
# (note: this overwrites the four-feature `model` and `scaler` fitted earlier)
model = LogisticRegression()
model.fit(X_train, y_train)
# Generate mesh grid for decision boundary visualization
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
# Predict the probability of staying active (class 1) for each mesh-grid point
Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
# Plot decision boundary
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, levels=20, cmap="coolwarm", alpha=0.7)
plt.colorbar(label="Probability of Staying Active")
# Scatter plot of actual data points
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, edgecolors="k", cmap="bwr", alpha=0.8)
# Labels and title
plt.xlabel("Total Purchases (Standardized)")
plt.ylabel("Support Tickets (Standardized)")
plt.title("Logistic Regression Decision Boundary - Churn Prediction")
plt.show()
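The next cell hard-codes the coefficient values; they can be read straight off a fitted estimator via `model.coef_`. Because the boundary plot above overwrote `model` and `scaler` with two-feature versions, this sketch re-fits the original four-feature model first (same split and `random_state`, so it should reproduce the values below).
# Re-fit the four-feature model to recover its coefficients
X4 = data[['Total Purchases', 'Last Purchase (Days Ago)', 'Support Tickets', 'Avg Order Value']]
y4 = data['Subscription Status']
X4_train, _, y4_train, _ = train_test_split(X4, y4, test_size=0.2, random_state=42)
scaler4 = StandardScaler()
model4 = LogisticRegression()
model4.fit(scaler4.fit_transform(X4_train), y4_train)
print(model4.coef_[0])      # source of the hard-coded coefficients below
print(model4.intercept_[0])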
# Feature names
features = ["Total Purchases", "Last Purchase (Days Ago)", "Support Tickets", "Avg Order Value"]
# Coefficients from the original four-feature model (model.coef_[0]);
# positive values push toward class 1 (staying active)
coefficients = [0.56425893, -0.04388535, -0.91195298, -0.00283536]
# Assign colors based on coefficient sign
colors = ["green" if coef > 0 else "red" for coef in coefficients]
# Create bar chart
plt.figure(figsize=(8, 5))
plt.bar(features, coefficients, color=colors)
plt.axhline(0, color="black", linewidth=1, linestyle="--") # Reference line at 0
plt.xlabel("Features")
plt.ylabel("Coefficient Value")
plt.title("Logistic Regression Feature Importance (Churn Prediction)")
plt.xticks(rotation=30, ha="right")
# Display the graph
plt.show()
# Count values
pi = data['Subscription Status'].value_counts()
# Plot pie chart
plt.pie(pi, labels=['Active','Non-Active'], autopct='%1.1f%%', pctdistance=0.6,
labeldistance=1.1, startangle=90)
plt.title('Active vs Canceled Subscriptions')
# Show the plot
plt.show()
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Unnamed: 0                735 non-null    int64
 1   Customer ID               735 non-null    int64
 2   Total Purchases           735 non-null    int64
 3   Last Purchase (Days Ago)  735 non-null    int64
 4   Support Tickets           735 non-null    int64
 5   Avg Order Value           735 non-null    float64
 6   Subscription Status       735 non-null    int64
dtypes: float64(1), int64(6)
memory usage: 40.3 KB
active = data[data['Subscription Status'] == 1]
active.describe()
|       | Unnamed: 0 | Customer ID | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Subscription Status |
|---|---|---|---|---|---|---|---|
| count | 490.000000 | 490.000000 | 490.000000 | 490.000000 | 490.000000 | 490.000000 | 490.0 |
| mean | 249.502041 | 1250.502041 | 50.440816 | 175.240816 | 4.381633 | 110.864224 | 1.0 |
| std | 144.294054 | 144.294054 | 28.949600 | 104.151686 | 2.894758 | 51.976864 | 0.0 |
| min | 0.000000 | 1001.000000 | 1.000000 | 1.000000 | 0.000000 | 20.190000 | 1.0 |
| 25% | 125.250000 | 1126.250000 | 25.000000 | 85.250000 | 2.000000 | 65.452500 | 1.0 |
| 50% | 250.500000 | 1251.500000 | 51.500000 | 178.000000 | 4.000000 | 114.050000 | 1.0 |
| 75% | 373.750000 | 1374.750000 | 73.750000 | 254.750000 | 7.000000 | 155.292500 | 1.0 |
| max | 499.000000 | 1500.000000 | 99.000000 | 364.000000 | 9.000000 | 199.340000 | 1.0 |
nonactive = data[data['Subscription Status'] == 0]
nonactive.describe()
|       | Unnamed: 0 | Customer ID | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Subscription Status |
|---|---|---|---|---|---|---|---|
| count | 245.000000 | 245.000000 | 245.000000 | 245.000000 | 245.000000 | 245.000000 | 245.0 |
| mean | 251.767347 | 1252.767347 | 4.236735 | 284.216327 | 6.710204 | 144.443143 | 0.0 |
| std | 155.324195 | 155.324195 | 2.512485 | 44.648423 | 1.406111 | 47.201468 | 0.0 |
| min | 45.000000 | 1046.000000 | 1.000000 | 217.000000 | 5.000000 | 64.560000 | 0.0 |
| 25% | 72.000000 | 1073.000000 | 2.000000 | 228.000000 | 5.000000 | 115.870000 | 0.0 |
| 50% | 228.000000 | 1229.000000 | 3.000000 | 277.000000 | 7.000000 | 168.410000 | 0.0 |
| 75% | 399.000000 | 1400.000000 | 6.000000 | 317.000000 | 8.000000 | 179.070000 | 0.0 |
| max | 481.000000 | 1482.000000 | 8.000000 | 350.000000 | 9.000000 | 193.180000 | 0.0 |
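The two `describe()` calls above can be condensed into one side-by-side view; a minimal sketch grouping on the label column.
# Mean of each feature for canceled (0) vs. active (1) customers in a single table
feature_cols = ['Total Purchases', 'Last Purchase (Days Ago)', 'Support Tickets', 'Avg Order Value']
print(data.groupby('Subscription Status')[feature_cols].mean())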
# Build a new DataFrame with churn prediction outputs for a dashboard and Airtable export
import pandas as pd
# Add a low/medium/high risk segment column
new_customers_df["Risk Segment"] = pd.cut(
new_customers_df["Churn Probability"],
bins=[0, 0.3, 0.6, 1],
labels=["Low Risk", "Medium Risk", "High Risk"]
)
new_customers_df.head(3)
|   | Total Purchases | Last Purchase (Days Ago) | Support Tickets | Avg Order Value | Churn Probability | Churn Prediction | Risk Segment |
|---|---|---|---|---|---|---|---|
| 0 | 39 | 191 | 10 | 491.262488 | 0.227241 | Stay | Low Risk |
| 1 | 52 | 218 | 9 | 263.151587 | 0.010269 | Stay | Low Risk |
| 2 | 29 | 44 | 6 | 137.806296 | 0.001463 | Stay | Low Risk |
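A quick count shows how the simulated customers distribute across the three buckets; a minimal sketch (the categorical dtype from `pd.cut` keeps all three labels even if one is empty).
# Customers per risk bucket
print(new_customers_df["Risk Segment"].value_counts().sort_index())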
# Compute average values for each risk level (observed=False keeps all three categories)
risk_averages = new_customers_df.groupby("Risk Segment", observed=False)[
    ["Total Purchases", "Last Purchase (Days Ago)", "Support Tickets", "Avg Order Value"]
].mean()
# Plot the comparison (pandas .plot creates its own figure, so no separate plt.figure is needed)
risk_averages.T.plot(kind="bar", figsize=(10, 6))
plt.xlabel("Customer Attributes")
plt.ylabel("Average Value")
plt.title("Comparison of Low, Medium, and High-Risk Customers")
plt.xticks(rotation=45)
plt.legend(title="Risk Level")
plt.show()
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Use the Elbow Method to determine the optimal number of clusters
inertia = []
cluster_range = range(1, 10)  # Test 1 to 9 clusters
for k in cluster_range:
    kmeans_test = KMeans(n_clusters=k, n_init=10, random_state=42)  # pin n_init; its default changed in newer scikit-learn
    kmeans_test.fit(new_customers_scaled)
    inertia.append(kmeans_test.inertia_)  # inertia = within-cluster sum of squared distances
# Plot the Elbow Method graph
plt.figure(figsize=(8, 5))
plt.plot(cluster_range, inertia, marker="o")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal Clusters")
plt.show()
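The elbow is often ambiguous, so the silhouette score makes a useful complementary check. A minimal sketch over the same candidate range (silhouette requires at least two clusters); higher is better.
from sklearn.metrics import silhouette_score
for k in range(2, 10):
    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(new_customers_scaled)
    print(f"k={k}: silhouette = {silhouette_score(new_customers_scaled, labels):.3f}")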
# Choose an optimal cluster count (e.g., 3 if the elbow method suggests it)
optimal_k = 3 # Change this based on your elbow method results
# Train K-Means with the optimal cluster count
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
new_customers_df["Customer Segment"] = kmeans.fit_predict(new_customers_scaled)
from sklearn.decomposition import PCA
# Reduce to 2D for visualization
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(new_customers_scaled)
# Plot the clusters
plt.figure(figsize=(8, 5))
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=new_customers_df["Customer Segment"], cmap="viridis", alpha=0.7)
plt.xlabel("PCA Component 1 (spending behavior)")
plt.ylabel("PCA Component 2 (customer service dependency)")
plt.title("Customer Segments (K-Means Clustering)")
plt.colorbar(label="Cluster ID")
plt.show()
# Print how much each original feature contributes to each PCA component
pca_components = pd.DataFrame(pca.components_, columns=X.columns, index=[f"PC{i+1}" for i in range(pca.n_components_)])
print(pca_components)
     Total Purchases  Last Purchase (Days Ago)  Support Tickets  Avg Order Value
PC1         0.041337                 -0.029370         0.065743         0.996547
PC2        -0.002778                  0.155137         0.986043        -0.060363
**PC1 (First Principal Component).** Avg Order Value (0.996547) contributes almost 100% of PC1; Support Tickets (0.065743) and Total Purchases (0.041337) have minimal impact, and Last Purchase (Days Ago) (-0.029370) has a slight negative effect.

**What PC1 represents:** "Spending Behavior". Customers with a high Avg Order Value have high PC1 values; PC1 is barely affected by purchase counts or customer support interactions.

**PC2 (Second Principal Component).** Support Tickets (0.986043) is the dominant factor in PC2; Last Purchase (Days Ago) (0.155137) makes a small positive contribution, while Total Purchases (-0.002778) and Avg Order Value (-0.060363) have almost no effect.

**What PC2 represents:** "Customer Service Dependency". Higher PC2 values indicate customers with more support tickets; Avg Order Value barely matters for this component.

**What this means for the clusters.** A customer with a high PC1 value is a big spender (high Avg Order Value); a customer with a high PC2 value contacts support a lot; a customer high on both is a big spender who also needs frequent support. On the PCA scatter plot, right-side customers (high PC1) are high-spending customers, top-side customers (high PC2) are customers with many support tickets, and the clusters may separate "VIP buyers" from "frequent complainers."
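How faithful is this 2-D picture? The explained variance ratio quantifies how much of the scaled feature variance the two components retain; a minimal sketch using the `pca` object fitted above.
# Fraction of total variance captured by each principal component
print(pca.explained_variance_ratio_)
print(f"Variance retained in 2D: {pca.explained_variance_ratio_.sum():.1%}")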