import pandas as pd
import matplotlib.pyplot as plt

# Load the job-posting dataset and show its column summary.
df = pd.read_csv("scam dataset.csv")
# DataFrame.info() writes the summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Scam_Keywords        500 non-null    int64  
 1   Job_Duration_Days    500 non-null    int64  
 2   Response_Time_Hours  500 non-null    int64  
 3   Response_Type        500 non-null    object 
 4   Email_Length_Words   424 non-null    float64
 5   Offered_Interview    500 non-null    int64  
 6   Interview_Type       178 non-null    object 
 7   Is_Spam              500 non-null    int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 31.4+ KB
None

# Replace every missing value with 0. Per the info() summary the columns with
# gaps are Email_Length_Words and Interview_Type, so a 0 in Interview_Type
# acts as the "no interview" placeholder that later cells compare against.
df = df.fillna(value=0)
df.head(10)

# Pull out the rows flagged as spam and report their count and share
spam = df.loc[df['Is_Spam'] == 1]
spam_total = len(spam)
print(f'number of spam posts: {spam_total}')
print(f'proportion of scam posts: {(spam_total/len(df)) * 100}%')

number of spam posts: 85
proportion of scam posts: 17.0%

# Row counts per response type within the spam subset
response_type_scam = spam.groupby('Response_Type').count()
response_type_scam

# Percentage of spam posts per response type, computed from the data instead
# of being typed in by hand, so the chart stays correct if the dataset changes.
# sort_index() keeps the slices in a stable alphabetical order.
scam_response_share = (
    spam['Response_Type'].value_counts(normalize=True).sort_index() * 100
)

# Kept under their original names; 0.0 when a type is absent from the data
el = scam_response_share.get('Email', 0.0)
tt = scam_response_share.get('Text', 0.0)

szs = scam_response_share.tolist()
lbl = scam_response_share.index.tolist()

plt.pie(szs, labels=lbl, autopct='%1.1f%%')
plt.title("Scam Response types")
plt.show()

# Rows that are not flagged as spam
nonspam = df[df['Is_Spam'] == 0]

# Row counts per response type within the non-spam subset
response_type_noscam = nonspam.groupby('Response_Type').count()
response_type_noscam

# Fraction of non-spam posts per response type, computed from the data instead
# of hand-typed counts. sort_index() keeps the slices in alphabetical order.
nonscam_response_share = (
    nonspam['Response_Type'].value_counts(normalize=True).sort_index()
)

# Kept under their original names; 0.0 when a type is absent from the data
e = nonscam_response_share.get('Email', 0.0)
p = nonscam_response_share.get('Phone Call', 0.0)
t = nonscam_response_share.get('Text', 0.0)

vlus = nonscam_response_share.tolist()
lbels = nonscam_response_share.index.tolist()

plt.pie(vlus, labels=lbels, autopct='%1.1f%%')
plt.title("Non-Scam Response Types")
plt.show()

# Keep only the posts answered by email in each group, then summarise the
# email word counts: legitimate posts first, spam posts second.
emaillegit = nonspam.loc[nonspam['Response_Type'] == "Email"]
emailspam = spam.loc[spam['Response_Type'] == "Email"]

for email_subset in (emaillegit, emailspam):
    print(email_subset["Email_Length_Words"].describe())

count    380.000000
mean     357.542105
std      164.108452
min       24.000000
25%      221.500000
50%      386.000000
75%      498.500000
max      599.000000
Name: Email_Length_Words, dtype: float64
count     44.000000
mean     360.886364
std      139.511476
min       36.000000
25%      268.250000
50%      368.500000
75%      465.250000
max      599.000000
Name: Email_Length_Words, dtype: float64

spam.describe()

nonspam.describe()

# Row counts per interview type within the spam subset
interview = spam.groupby('Interview_Type').count()
interview

# In the dataset as loaded: of 85 spam posts, 28 are chat box meetings, 42 were
# not offered an interview, 6 were offered in-person interviews and 9 were
# offered video call interviews.
chaty = spam[spam['Interview_Type'] == 'Chat Box Meeting']
videoy = spam[spam['Interview_Type'] == 'Video Call']
persony = spam[spam['Interview_Type'] == 'In-Person']
# 0 is the placeholder fillna(0) wrote where no interview type was recorded
noney = spam[spam['Interview_Type'] == 0]

# Use the real subset size rather than a hard-coded 85, and round after
# scaling to percent so the printed values carry no floating-point noise.
n_spam = len(spam)
chatper = round(len(chaty) / n_spam * 100, 1)
videoper = round(len(videoy) / n_spam * 100, 1)
perper = round(len(persony) / n_spam * 100, 1)
noper = round(len(noney) / n_spam * 100, 1)

print(f'Chat Box meetings are {chatper}% of scams')
print(f'Video Meetings meetings are {videoper}% of scams')
print(f'In-Person meetings are {perper}% of scams')
print(f'Cases where no interviews were offered are {noper}% of scams')

labels = ['Chat Box Meeting', 'Video Call', 'In-Person', 'No Interview']
sizes = [chatper, videoper, perper, noper]

plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title("Interview Types for spam posts")
# plt.show must be called; the bare name only evaluates to the function object
plt.show()

Chat Box meetings are 32.9% of scams
Video Meetings meetings are 10.6% of scams
In-Person meetings are 7.1% of scams
Cases where no interviews were offered are 49.4% of scams

<function matplotlib.pyplot.show(close=None, block=None)>

# Row counts per interview type within the non-spam subset
interview2 = nonspam.groupby('Interview_Type').count()
interview2

chatno = nonspam[nonspam['Interview_Type'] == 'Chat Box Meeting']
videono = nonspam[nonspam['Interview_Type'] == 'Video Call']
personno = nonspam[nonspam['Interview_Type'] == 'In-Person']
# 0 is the placeholder fillna(0) wrote where no interview type was recorded
noneno = nonspam[nonspam['Interview_Type'] == 0]

# These are shares "of non scams", so the denominator must be the number of
# non-spam rows. Dividing by the full dataset size (500) understated every
# share and made the four of them sum to well under 100%.
n_nonspam = len(nonspam)
nonchatper = round(len(chatno) / n_nonspam * 100, 1)
nonvideoper = round(len(videono) / n_nonspam * 100, 1)
nonperper = round(len(personno) / n_nonspam * 100, 1)
nonnoper = round(len(noneno) / n_nonspam * 100, 1)

print(f'Chat Box meetings are {nonchatper}% of non scams')
print(f'Video Meetings meetings are {nonvideoper}% of non scams')
print(f'In-Person meetings are {nonperper}% of non scams')
print(f'Cases where no interviews were offered are {nonnoper}% of non scams')

labels = ['Chat Box Meeting', 'Video Call', 'In-Person', 'No Interview']
sizes = [nonchatper, nonvideoper, nonperper, nonnoper]

# NOTE: the style change is global and also affects every plot drawn after this
plt.style.use('ggplot')
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title("Interview Types for non-spam posts")
# plt.show must be called; the bare name only evaluates to the function object
plt.show()

Chat Box meetings are 2.8000000000000003% of non scams
Video Meetings meetings are 11.4% of non scams
In-Person meetings are 12.8% of non scams
Cases where no interviews were offered are 56.00000000000001% of non scams

<function matplotlib.pyplot.show(close=None, block=None)>

# List the distinct values of both text columns so they can be encoded by hand
response_kinds = df['Response_Type'].unique().tolist()
interview_kinds = df['Interview_Type'].unique().tolist()
print(f"unique response types: {response_kinds}")
print(f"unique interview types: {interview_kinds}")

unique response types: ['Email', 'Text', 'Phone Call']
unique interview types: [0, 'In-Person', 'Chat Box Meeting', 'Video Call']

# Encode the string categories as integers to prepare for the regression model.
# Any value missing from a lookup table (including the 0 placeholder) maps to 0.
interview_codes = {'In-Person': 1, 'Video Call': 2, 'Chat Box Meeting': 3}
response_codes = {'Email': 1, 'Phone Call': 2, 'Text': 3}

# map() leaves NaN for unlisted values; fillna(0) supplies the fallback code
# and the cast restores an integer column.
df["Interview_Type"] = (
    df["Interview_Type"].map(interview_codes).fillna(0).astype('int64')
)
df["Response_Type"] = (
    df["Response_Type"].map(response_codes).fillna(0).astype('int64')
)
df.head(10)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Target is the Is_Spam flag; features are every remaining column
y = df['Is_Spam']
X = df.drop(columns=['Is_Spam'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the classifier on the training portion (fit returns the estimator itself)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test, y_pred, target_names=['Not Spam', 'Spam']
)

# Show the results
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

    Not Spam       0.95      0.93      0.94        83
        Spam       0.68      0.76      0.72        17

    accuracy                           0.90       100
   macro avg       0.82      0.85      0.83       100
weighted avg       0.91      0.90      0.90       100

# Graph the fitted coefficient of each feature to see its impact on the model
import numpy as np

coefficients = model.coef_[0]
# Take the labels from the training frame the model was fitted on, so they
# always match the coefficients in number and order (no hand-typed list).
feature_names = np.array(X_train.columns)
features = feature_names.tolist()

plt.figure(figsize=(8, 6))
plt.barh(features, coefficients)
plt.xlabel('Coefficient Value')
plt.title('Regression Coefficients (Strength of Predictor Values)')
plt.show()

# Print the same values as text, one feature per line
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.4f}")

Scam_Keywords: 3.2572
Job_Duration_Days: 0.0089
Response_Time_Hours: -0.2050
Response_Type: 2.8700
Email_Length_Words: 0.0003
Offered_Interview: -1.1683
Interview_Type: 1.5752

# Hand-entered details of one new job application, using the integer codes
# assigned by the encoding step
new_post = dict(
    Scam_Keywords=1,  # Yes
    Job_Duration_Days=60,  # Days posted
    Response_Time_Hours=2,  # Hours it took to respond
    Response_Type=3,  # Text
    Email_Length_Words=0,  # Response was not an email, so 0
    Offered_Interview=1,  # Yes
    Interview_Type=3,  # Chat Box Meeting
)

# Single-row frame in the layout the model expects
new_post_df = pd.DataFrame([new_post])

# Predicted class for the row (1 = Scam, 0 = Not Scam)
prediction = model.predict(new_post_df)[0]

# Report the verdict
verdict = (
    "⚠️ This job listing is predicted to be a SCAM."
    if prediction == 1
    else "✅ This job listing looks LEGITIMATE."
)
print(verdict)

# Per-class probabilities: [probability_of_not_spam, probability_of_spam]
y_prob = model.predict_proba(new_post_df)

# Column 1 holds the probability for class 1 (scam)
scam_probability = y_prob[0][1]
print(f"Probability of being a scam: {scam_probability*100:.2f}%")

⚠️ This job listing is predicted to be a SCAM.
Probability of being a scam: 99.99%

from sklearn.preprocessing import StandardScaler

# X holds the features (label and Job_Duration_Days removed); y is the outcome
X = df.drop(columns=['Is_Spam', 'Job_Duration_Days'])
y = df['Is_Spam']

# Split BEFORE scaling: 20% of the rows for test, 80% for training. The same
# seed and row count select the same rows as splitting the scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only. Fitting it on the full dataset
# lets the test rows influence the mean/std used for training (data leakage).
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train_raw)
X_scaled_test = scaler.transform(X_test_raw)
# Whole dataset on the training scale, kept under its original name
X_scaled = scaler.transform(X)

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_scaled_train, y_train)

# Make predictions
y_pred = model.predict(X_scaled_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Not Spam', 'Spam'])

# Display the results
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

    Not Spam       0.95      0.93      0.94        83
        Spam       0.68      0.76      0.72        17

    accuracy                           0.90       100
   macro avg       0.82      0.85      0.83       100
weighted avg       0.91      0.90      0.90       100

Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

    Not Spam       0.95      0.93      0.94        83
        Spam       0.68      0.76      0.72        17

    accuracy                           0.90       100
   macro avg       0.82      0.85      0.83       100
weighted avg       0.91      0.90      0.90       100

from sklearn.feature_selection import f_classif
import pandas as pd

# Compute the ANOVA F-statistic and p-value of each scaled training feature
f_values, p_values = f_classif(X_scaled_train, y_train)

# Label the results with the columns of X, the frame the scaled matrix was
# built from. Using df.drop(columns="Is_Spam") would include columns that were
# removed from X and no longer line up with f_values / p_values.
anova_results = pd.DataFrame(
    {"Feature": X.columns, "F-Value": f_values, "P-Value": p_values}
)
anova_results = anova_results.sort_values(by="P-Value")  # Sort by significance

print(anova_results)

               Feature     F-Value       P-Value
3        Response_Type  109.467213  8.540187e-23
0        Scam_Keywords   75.114239  1.133095e-16
6       Interview_Type   35.004465  7.097162e-09
4   Email_Length_Words   22.079253  3.609641e-06
2  Response_Time_Hours   22.037037  3.685636e-06
5    Offered_Interview   10.076224  1.618733e-03
1    Job_Duration_Days    0.114729  7.350015e-01

# Graph the fitted coefficient of each feature to see its impact on the model
import numpy as np

coefficients = model.coef_[0]
# Take the labels from X, the frame the current (scaled) model's inputs were
# built from. A hand-typed list, or X_train from the earlier unscaled model,
# can contain columns this model never saw and would mislabel the coefficients.
feature_names = np.array(X.columns)
features = feature_names.tolist()

plt.figure(figsize=(8, 6))
plt.barh(features, coefficients)
plt.xlabel('Coefficient Value')
plt.title('Regression Coefficients (Strength of Predictor Values)')
plt.show()

# Print the same values as text, one feature per line
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.4f}")

Scam_Keywords: 1.9435
Job_Duration_Days: 0.2784
Response_Time_Hours: -1.5118
Response_Type: 2.2343
Email_Length_Words: 0.1956
Offered_Interview: -0.8774
Interview_Type: 1.9833

	Scam_Keywords	Job_Duration_Days	Response_Time_Hours	Email_Length_Words	Offered_Interview	Is_Spam
count	85.000000	85.000000	85.000000	85.000000	85.000000	85.0
mean	0.670588	47.152941	9.023529	186.811765	0.505882	1.0
std	0.472789	25.100989	7.327340	207.050114	0.502933	0.0
min	0.000000	4.000000	1.000000	0.000000	0.000000	1.0
25%	0.000000	27.000000	3.000000	0.000000	0.000000	1.0
50%	1.000000	46.000000	6.000000	57.000000	1.000000	1.0
75%	1.000000	66.000000	16.000000	369.000000	1.000000	1.0
max	1.000000	88.000000	23.000000	599.000000	1.000000	1.0

	Scam_Keywords	Job_Duration_Days	Response_Time_Hours	Email_Length_Words	Offered_Interview	Is_Spam
count	415.000000	415.000000	415.000000	415.000000	415.000000	415.0
mean	0.243373	42.971084	12.759036	327.387952	0.325301	0.0
std	0.429637	24.825266	6.468411	185.878217	0.469053	0.0
min	0.000000	1.000000	1.000000	0.000000	0.000000	0.0
25%	0.000000	21.000000	7.000000	172.500000	0.000000	0.0
50%	0.000000	43.000000	13.000000	346.000000	0.000000	0.0
75%	0.000000	64.000000	18.000000	489.000000	1.000000	0.0
max	1.000000	89.000000	23.000000	599.000000	1.000000	0.0

Logistic Regression to Identify Scam/Spam Job Listings¶

Let's look at the response type for scam cases¶

And non-spam cases:¶

Conclusions for response type:¶

Now I will investigate Interview_Type to see which interview type is associated with more scams.¶

Accuracy¶

Not Spam Precision¶

Spam Precision¶

Spam Recall¶

	Scam_Keywords	Job_Duration_Days	Response_Time_Hours	Response_Type	Email_Length_Words	Offered_Interview	Interview_Type	Is_Spam
0	1	41	22	Email	248.0	0	0	0
1	0	63	9	Email	109.0	1	In-Person	0
2	0	23	17	Email	580.0	1	Chat Box Meeting	0
3	0	75	12	Email	465.0	1	Video Call	0
4	1	48	7	Email	329.0	0	0	0
5	1	32	10	Email	169.0	1	Video Call	0
6	1	57	10	Text	0.0	0	0	1
7	0	33	13	Email	151.0	1	Chat Box Meeting	0
8	0	24	19	Email	452.0	0	0	0
9	1	39	11	Email	505.0	1	In-Person	0

	Scam_Keywords	Job_Duration_Days	Response_Time_Hours	Email_Length_Words	Offered_Interview	Interview_Type	Is_Spam
Response_Type
Email	380	380	380	380	380	380	380
Phone Call	25	25	25	25	25	25	25
Text	10	10	10	10	10	10	10

	Scam_Keywords	Job_Duration_Days	Response_Time_Hours	Response_Type	Email_Length_Words	Offered_Interview	Is_Spam
Interview_Type
0	42	42	42	42	42	42	42
Chat Box Meeting	28	28	28	28	28	28	28
In-Person	6	6	6	6	6	6	6
Video Call	9	9	9	9	9	9	9