Augmenting Zeek-Based Detection with Machine Learning to Identify Beaconing and C2 Recon Traffic¶

Command-and-control (C2) traffic is a hallmark of many cyberattacks, enabling remote adversaries to maintain persistence and control over compromised systems. In this study, we propose a supervised machine learning (ML) approach that augments Zeek network traffic logs to detect C2-related behavior at the connection level, using labeled reconnaissance traffic as a proxy for malicious C2 activity. This can be used to automate the recognition and handling of suspicious connections without requiring manual intervention.

Dataset¶

The data consists of ~2 million MITRE ATT&CK-labeled network connection records produced by Zeek, an open-source network security monitoring (NSM) tool used for packet inspection and analysis.

The Zeek log data used in this analysis comes from UWF (University of West Florida); the dataset and subsequent work on it can be viewed and downloaded here: https://datasets.uwf.edu/

The 'label' column reflects whether a Zeek log entry was associated with beaconing/C2 behavior or with normal activity. We apply machine learning to detect such traffic from Zeek's native connection fields (duration, bytes, conn_state, service, etc.). While this analysis uses labeled data for supervised learning, in a real-world SOC the same approach could flag suspicious connections for analyst review or feed into SIEM enrichment, as sketched below.
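A minimal sketch of that workflow, assuming a model already trained on the connection-level features described below (the function name, feature_columns argument, and 0.9 threshold are illustrative assumptions, not part of this notebook):

import pandas as pd

def flag_suspicious(conn_df: pd.DataFrame, trained_model, feature_columns, threshold=0.9):
    # Hypothetical helper: score each Zeek connection record with a trained classifier
    # and keep only the rows whose predicted recon/C2 probability exceeds the threshold.
    scores = trained_model.predict_proba(conn_df[feature_columns])[:, 1]
    flagged = conn_df.assign(c2_score=scores)
    return flagged[flagged['c2_score'] >= threshold].sort_values('c2_score', ascending=False)

The flagged rows, together with their scores, could then be routed to an analyst queue or attached to SIEM events as enrichment.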

Column Explanations (Relevant for EDA & Modeling)¶

| Column | Meaning | Usage |
| --- | --- | --- |
| duration | Duration of the connection (seconds) | Used in features list |
| orig_bytes | Bytes sent from source to destination | Used in features list |
| resp_bytes | Bytes sent from destination to source | Used in features list |
| orig_pkts | Packets sent from source to destination | Used in features list |
| resp_pkts | Packets sent from destination to source | Used in features list |
| src_ip, dest_ip | IPs involved in the connection | Not used in features list |
| src_port, dest_port | Network ports used | Used in features list |
| service | Application-layer protocol identified (e.g., HTTP, FTP) | Used in features list (encoded) |
| protocol | Transport-layer protocol (TCP/UDP) | Used in features list (encoded) |
| conn_state | Zeek connection state (S0, SF, REJ, etc.) | Used in features list (encoded) |
| history | Sequence of connection flags (SYNs, ACKs, etc.) | Not used in features list |
| mitre_attack_tactics | 'none' for benign, 'Reconnaissance' for attack | Used to create 'label' |
| datetime, ts | Timestamp information | Not used in features list |
| label | 0 for benign (mitre_attack_tactics = 'none'), 1 for attack (mitre_attack_tactics = 'Reconnaissance') | Target (y) variable |

Methods & Results Discussion¶

We used this labeled Zeek log dataset containing benign and reconnaissance traffic to build a supervised classification pipeline. After cleaning the data and extracting features such as packet and byte counts, duration, ports, and protocol indicators, we split the data into training and test sets using stratified sampling. Numerical features were scaled where necessary, and categorical network metadata (e.g., protocol, service, connection state) was one-hot encoded. Three models were trained: Logistic Regression, Random Forest, and XGBoost, each configured to handle class imbalance. Random Forest performed best, achieving a ROC AUC of 0.9999, followed closely by XGBoost at 0.9996, while Logistic Regression achieved 0.9910.

The Random Forest also attained near-perfect precision, recall, and F1-score, indicating its robustness in classifying early-stage C2 behavior and demonstrating the feasibility of integrating ML into network intrusion detection workflows.

Further research could target network traffic at the host level, aggregating connection entries by individual source IP address; a rough sketch of that aggregation follows.
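A hypothetical sketch of what such host-level aggregation could look like, assuming the same column names used in the connection-level data below:

host_features = (
    df.groupby('src_ip')
      .agg(
          n_conns=('uid', 'count'),               # connection volume per source
          n_dest_ips=('dest_ip', 'nunique'),      # fan-out across destination hosts
          n_dest_ports=('dest_port', 'nunique'),  # fan-out across destination ports
          mean_duration=('duration', 'mean'),
          mean_orig_bytes=('orig_bytes', 'mean'),
          recon_rate=('label', 'mean'),           # share of a host's connections labeled recon
      )
      .reset_index()
)

Each row would then describe a source host rather than a single connection, which is closer to how an analyst reasons about a potentially compromised machine.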

In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import roc_curve

import warnings
warnings.filterwarnings('ignore')

# filenames
f1 = 'part-00000-5b4f5c3f-e8a9-4020-8fa1-e8985f7c27f3-c000.csv'
f2 = 'part-00000-95e0a460-e7c5-4b35-8367-c2e6fbbcf9e1-c000.csv'

df = pd.concat([pd.read_csv(f1), pd.read_csv(f2)], ignore_index=True)
print(df.shape)  
df.columns
(2000981, 23)
Out[70]:
Index(['resp_pkts', 'service', 'orig_ip_bytes', 'local_resp', 'missed_bytes',
       'protocol', 'duration', 'conn_state', 'dest_ip', 'orig_pkts',
       'community_id', 'resp_ip_bytes', 'dest_port', 'orig_bytes',
       'local_orig', 'datetime', 'history', 'resp_bytes', 'uid', 'src_port',
       'ts', 'src_ip', 'mitre_attack_tactics'],
      dtype='object')
In [3]:
df.isnull().sum()
Out[3]:
resp_pkts                    0
service                  48335
orig_ip_bytes                0
local_resp                   0
missed_bytes                 0
protocol                     0
duration                132372
conn_state                   0
dest_ip                      0
orig_pkts                    0
community_id                 0
resp_ip_bytes                0
dest_port                    0
orig_bytes              132372
local_orig                   0
datetime                     0
history                  18267
resp_bytes              132372
uid                          0
src_port                     0
ts                           0
src_ip                       0
mitre_attack_tactics         0
dtype: int64
In [4]:
df.replace([float('inf'), float('-inf')], pd.NA, inplace=True)
df.dropna(inplace=True)
In [5]:
# Binary label: 1 = attack, 0 = benign
df['label'] = df['mitre_attack_tactics'].apply(lambda x: 1 if x == 'Reconnaissance' else 0)

# Preview distribution
print(df['mitre_attack_tactics'].unique())
print(df['label'].value_counts())
['Reconnaissance' 'none']
label
0    1432660
1     414720
Name: count, dtype: int64

"Reconnaissance" refers to a phase in the cyber attack cycle where the attackers are gathering information about a network. The Dataset includes about a 3 to 1 split of benign vs attack or 'recon' logs.

Reconnaissance typically includes:¶

- Port scanning (e.g., using Nmap) to identify open ports and services on a host
- IP sweeps to check which hosts are alive on a network
- Service banner grabbing and probing for software versions or OS fingerprints
- DNS queries or reverse lookups to map domain names and internal assets

Zeek Logs Indicating Recon Activity in this dataset:¶

- Many short-lived connections from a single IP
- Sequential attempts across multiple ports or hosts
- Unusual connection states (conn_state values like S0, REJ)
- Frequent failed connections or very short durations
- Small payloads at high frequency

A quick per-source sketch of these signals is shown below.
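These indicators can be surfaced directly from the fields above. An illustrative pandas sketch (not part of the modeling pipeline below; thresholds and column groupings are assumptions):

scan_indicators = (
    df.groupby('src_ip')
      .agg(
          distinct_ports=('dest_port', 'nunique'),                             # ports probed per source
          distinct_hosts=('dest_ip', 'nunique'),                               # hosts touched per source
          failed_frac=('conn_state', lambda s: s.isin(['S0', 'REJ']).mean()),  # unanswered/rejected attempts
          median_duration=('duration', 'median'),                              # short-lived connections
      )
      .sort_values('distinct_ports', ascending=False)
)
print(scan_indicators.head(10))  # sources probing many ports/hosts with mostly failed, short connections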

In [7]:
# Visualize durations (log scale) by label
sns.histplot(data=df, x='duration', hue='label', log_scale=True)
plt.show()

# Byte distributions by label
sns.boxplot(data=df, x='label', y='orig_bytes')
plt.show()
sns.boxplot(data=df, x='label', y='resp_bytes')
plt.show()

# Protocol usage by label
print(df.groupby('protocol')['label'].value_counts(normalize=True).unstack())
label            0         1
protocol                    
tcp       1.000000       NaN
udp       0.775509  0.224491
[Figure: duration histogram and byte-count boxplots by label]

Benign traffic (label 0) spans a wide range of resp_bytes, with a large concentration in the 500–30,000 byte range and notable spikes around ~15k and ~25k bytes. This suggests that normal user activity often involves substantial data being returned (e.g., web pages, downloads).

Reconnaissance traffic (label 1) shows a much tighter cluster: mostly below 25,000 bytes, more narrowly distributed, and often with very small payloads.

Recon traffic typically sends probes or requests but does not receive large responses, consistent with port scans, pings, or failed requests. Benign traffic produces richer responses, with larger and more varied resp_bytes. This indicates resp_bytes is a discriminative feature that can help the model separate recon from normal activity; a quick per-label summary is sketched below.
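A quick numeric check of that claim (a small summary not included in the original outputs):

# Per-label summary of response bytes (label 0 = benign, 1 = recon)
print(df.groupby('label')['resp_bytes'].describe(percentiles=[0.5, 0.9]))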

In [57]:
# Numerical features to inspect (assumed list; this definition was missing from the
# notebook and is inferred from the five distributions plotted below)
numerical_features = ['duration', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts']

# Add log-transformed versions of skewed features
for feature in numerical_features:
    df[f'log1p_{feature}'] = np.log1p(df[feature])

# Plot violin plots for log-transformed features
log_features = [f'log1p_{f}' for f in numerical_features]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 12))
axes = axes.flatten()

for i, feature in enumerate(log_features):
    sns.violinplot(data=df, x='label', y=feature, ax=axes[i], inner='quartile', linewidth=1)
    axes[i].set_title(f'Log Distribution of {feature[6:]} by Label')  # Strip 'log1p_' prefix

# Remove empty subplot if needed
fig.delaxes(axes[-1])
plt.tight_layout()
plt.show()
[Figure: violin plots of log1p-transformed duration, orig_bytes, resp_bytes, orig_pkts, and resp_pkts by label]

duration, resp_bytes, and resp_pkts show heavier tails in normal traffic (label = 0), while malicious flows tend to be more compact.

orig_bytes and orig_pkts are more tightly centered in the malicious class (label = 1), suggesting attackers may be sending more consistent packet sizes and counts, which is typical of beaconing/C2; a rough per-source consistency check is sketched below.
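A hypothetical way to check that consistency per source host (the minimum connection count of 20 and the coefficient-of-variation threshold of 0.1 are arbitrary illustrative values):

# Coefficient of variation of outbound bytes per source IP: low values suggest
# very uniform request sizes, one hallmark of beacon-like behavior.
per_src = df.groupby('src_ip')['orig_bytes'].agg(['count', 'mean', 'std'])
per_src['cv'] = per_src['std'] / per_src['mean'].replace(0, np.nan)
candidates = per_src[(per_src['count'] >= 20) & (per_src['cv'] < 0.1)]
print(candidates.sort_values('count', ascending=False).head())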

In [10]:
# Protocol usage by label
protocol_dist = df.groupby('protocol')['label'].value_counts(normalize=True).unstack()
protocol_dist.plot(kind='bar', stacked=True, figsize=(8, 4))
plt.title('Protocol Usage by Label')
plt.ylabel('Proportion')
plt.xlabel('Protocol')
plt.legend(title='Label (0=Benign, 1=Recon)')
plt.tight_layout()
plt.show()

# Connection state usage by label
conn_state_dist = df.groupby('conn_state')['label'].value_counts(normalize=True).unstack()
conn_state_dist.plot(kind='bar', stacked=True, figsize=(8, 4))
plt.title('Connection State Usage by Label')
plt.ylabel('Proportion')
plt.xlabel('Connection State')
plt.legend(title='Label (0=Benign, 1=Recon)')
plt.tight_layout()
plt.show()

# Network services by label
top_services = df['service'].value_counts().head(10).index
service_dist = df[df['service'].isin(top_services)].groupby('service')['label'].value_counts(normalize=True).unstack()
service_dist.plot(kind='bar', stacked=True, figsize=(8, 4))
plt.title('Network Services by Label')
plt.ylabel('Proportion')
plt.xlabel('Service')
plt.legend(title='Label (0=Benign, 1=Recon)')
plt.tight_layout()
plt.show()
[Figures: stacked bar charts of protocol, connection state, and top-10 service usage by label]

Protocol Usage:¶

TCP (Transmission Control Protocol): Reliable, connection-oriented, ensures packet delivery. Used in most web traffic (e.g., HTTP, HTTPS, SSL). Less common for beaconing/C2 due to overhead and visibility in session-based monitoring.

UDP (User Datagram Protocol): Connectionless, fast, minimal overhead. Used by services like DNS, NTP, and DHCP. More associated with malicious traffic in the data because attackers often exploit lightweight UDP for covert channels, port scanning, and beaconing due to its lack of session tracking and ease of spoofing.

Connection States:¶

These are from Zeek logs, reflecting the outcome of a connection attempt:

SF (State: normal establishment and termination): Indicates a successfully completed session. Frequent in both benign and malicious traffic; attackers that complete connections for C2 or data exfiltration will also show up as SF.

SHR (State: responder sent a SYN-ACK followed by a FIN, with no SYN seen from the originator): A half-open exchange in which Zeek never observed the originator's side of the handshake. In this dataset it is highly associated with reconnaissance activity such as scanning, where many probes are sent but few full connections are ever established.

S0 (State: connection attempt seen, no reply): The originator's SYN goes unanswered. Common in scanning and failed connection attempts, and strongly associated with reconnaissance where the target silently ignores the probe rather than responding.

Services:¶

These refer to the application-layer protocols detected:

DHCP (Dynamic Host Configuration Protocol): Assigns IP addresses in a network. Not usually attacker-controlled but may show up in anomalies if rogue devices request or manipulate DHCP responses to blend in or perform network mapping.

DNS (Domain Name System): Resolves domain names to IPs. Top malicious service in the data — commonly used in DNS tunneling, beaconing, and C2. Attackers abuse it to exfiltrate data via DNS queries or maintain stealthy communication.

NTP (Network Time Protocol): Synchronizes system clocks. Malicious use can arise when bots check in with precise timing or use NTP as a timing channel.

SSL (Secure Sockets Layer): Encrypted TCP sessions (e.g., HTTPS). Often benign, but can hide C2 traffic, making it harder to inspect. In C2 activity, attackers may use SSL to encrypt payloads.

In [11]:
corr = df[numerical_features + ['label']].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=False, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()
[Figure: correlation heatmap of numerical features and label]
In [12]:
# One-hot encode protocol, conn_state, service
df = pd.get_dummies(df, columns=['protocol', 'conn_state', 'service'], drop_first=True)

features = [
    'duration', 'orig_bytes', 'resp_bytes',
    'orig_pkts', 'resp_pkts', 'src_port', 'dest_port',
    'protocol_udp', 'conn_state_SF', 'conn_state_SHR',
    'service_dns', 'service_ntp', 'service_ssl'
]
In [67]:
# Feature matrix and target
X = df[features]
y = df['label']

# Split the data (stratified to preserve the benign/recon ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Scale for models that need it
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(), use_label_encoder=False, eval_metric='logloss'),

}

# Train and evaluate
results = []
best_model_name = None
best_auc = 0
best_y_prob = None

for name, model in models.items():
    print(f"\n- {name} -")
    
    # Use scaled data for models that require it
    if name in ["Logistic Regression", "SVM", "KNN"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

    print(classification_report(y_test, y_pred))
    auc = roc_auc_score(y_test, y_prob)
    if auc > best_auc:
        best_auc = auc
        best_model_name = name
        best_y_prob = y_prob
    print(f"ROC AUC: {auc:.4f}")

    results.append({
        "Model": name,
        "ROC AUC": auc,
        "Precision": classification_report(y_test, y_pred, output_dict=True)['1']['precision'],
        "Recall": classification_report(y_test, y_pred, output_dict=True)['1']['recall'],
        "F1 Score": classification_report(y_test, y_pred, output_dict=True)['1']['f1-score']
    })

# Show summary comparison table
comparison_df = pd.DataFrame(results).sort_values(by="ROC AUC", ascending=False)
print("\n Model Comparison:")
print(comparison_df)

fpr, tpr, _ = roc_curve(y_test, best_y_prob)
plt.plot(fpr, tpr, label=f'Best Model ROC ({best_model_name})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Best Performing Model')
plt.legend()
plt.show()
- Logistic Regression -
              precision    recall  f1-score   support

           0       1.00      0.95      0.98    358165
           1       0.86      1.00      0.92    103680

    accuracy                           0.96    461845
   macro avg       0.93      0.98      0.95    461845
weighted avg       0.97      0.96      0.96    461845

ROC AUC: 0.9910

- Random Forest -
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    358165
           1       1.00      1.00      1.00    103680

    accuracy                           1.00    461845
   macro avg       1.00      1.00      1.00    461845
weighted avg       1.00      1.00      1.00    461845

ROC AUC: 0.9999

- XGBoost -
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    358165
           1       0.98      1.00      0.99    103680

    accuracy                           0.99    461845
   macro avg       0.99      1.00      0.99    461845
weighted avg       0.99      0.99      0.99    461845

ROC AUC: 0.9996

 Model Comparison:
                 Model   ROC AUC  Precision  Recall  F1 Score
1        Random Forest  0.999942   0.999085     1.0  0.999542
2              XGBoost  0.999575   0.976519     1.0  0.988120
0  Logistic Regression  0.991035   0.858555     1.0  0.923895
[Figure: ROC curve for the best performing model (Random Forest)]

Summary:¶

The results show that all three models performed well, with Random Forest achieving the highest ROC AUC. Specifically, Random Forest reached a ROC AUC of 0.9999 with near-perfect precision, recall, and F1-score for both benign and reconnaissance traffic. XGBoost followed closely with a ROC AUC of 0.9996 and strong classification metrics. Logistic Regression, while slightly less precise, still delivered a ROC AUC of 0.9910 and perfect recall on the reconnaissance class. Overall, the models, especially Random Forest, demonstrated high effectiveness in detecting reconnaissance traffic in Zeek logs.
