Contextual Advertising: Supervised Text Classification¶

Imagine you're a media buying company, Chrishare, with a new client, Theragun.

Theragun knows that consumers who value health and wellness are more likely to consider, and ultimately buy, their product. So they'd like to find health and wellness news around the web to advertise on. The goal of the media campaign is to identify as many news articles that mention health and wellness as possible, but we will also compare model results against a money-saving oriented goal.

This is called contextual advertising: finding the URLs that match the context in which you'd like your ad to be shown.

Our challenge now is to build a deep learning model that predicts the probability that a news story is about health and wellness, using ktrain.

Dataset¶

This dataset contains around 200k news headlines from 2012 to 2018, obtained from HuffPost. A model trained on this dataset could be used to identify tags for untracked news articles or to identify the type of language used in different news articles. Each news headline has a corresponding category (Health, Wellness, Entertainment, Politics, Sports, etc.).

To prepare for machine learning, we created the classes "Health" and "Not_Health", where "Health" covers entries labeled WELLNESS or HEALTHY LIVING and "Not_Health" covers all other categories. This gives a split of about 176k Not_Health articles and about 24k Health articles.
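A minimal sketch of that labeling step (the corresponding notebook cell appears further down):

# Articles tagged WELLNESS or HEALTHY LIVING become "Health"; everything else is "Not_Health".
health_cats = {"WELLNESS", "HEALTHY LIVING"}
df["label_str"] = df["category"].apply(lambda c: "Health" if c in health_cats else "Not_Health")
print(df["label_str"].value_counts())  # ~176k Not_Health vs ~24k Health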

Methodology¶

To detect whether a news article belongs to the health/wellness category, we fine-tuned a transformer-based text classifier using the ktrain wrapper over TensorFlow/Keras and Hugging Face transformers.

For the input representation, each article was represented by concatenating the headline and short_description, separated by a [SEP] token. Including the short_description should provide richer context than using headlines alone. Before training, missing descriptions were replaced with empty strings, and duplicate/null rows were removed.
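A minimal sketch of that preprocessing (assuming a DataFrame df with headline, short_description, and label_str columns; the full cell appears later in the notebook):

# Concatenate headline and short_description, separated by BERT's [SEP] token,
# so the classifier sees both fields as a single input sequence.
df["short_description"] = df["short_description"].fillna("")
df["text"] = (df["headline"].astype(str).str.strip()
              + " [SEP] "
              + df["short_description"].astype(str).str.strip())

# Drop null labels/texts and exact duplicate texts.
df = df.dropna(subset=["text", "label_str"])
df = df[df["text"].str.len() > 0].drop_duplicates(subset=["text"]).reset_index(drop=True)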

Model¶

A BERT-based classifier built with ktrain.text.text_classifier("bert", ...) was used. BERT embeddings capture semantic meaning at the subword and sentence level, making them well suited for contextual classification. The model was configured with a maximum sequence length of 96 tokens, which should balance capturing multi-sentence inputs against keeping computation manageable.
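In ktrain this configuration looks roughly like the following (a sketch; the full training cell appears later):

from ktrain import text

# BERT preprocessing: each "headline [SEP] short_description" string is tokenized
# and truncated/padded to 96 subword tokens; 20% of rows are held out for validation.
(x_train, y_train), (x_val, y_val), preproc = text.texts_from_array(
    x_train=df["text"].values,
    y_train=df["label_str"].values,
    class_names=["Health", "Not_Health"],
    val_pct=0.2,
    maxlen=96,
    preprocess_mode="bert",
    random_state=42,
)
model = text.text_classifier("bert", (x_train, y_train), preproc=preproc)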

Class Imbalance¶

The dataset is highly imbalanced at roughly 176k Not_Health to 24k Health entries. To counter this, class weights were computed with sklearn.utils.class_weight: the minority class was upweighted and the majority class downweighted. During training, this rescales the loss function so that misclassifying Health articles is penalized more heavily, preventing the model from defaulting to predicting the majority class.
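A sketch of that computation (y_train_ids here stands for the integer label ids of the training split, as in the training cell below):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# "balanced" sets each weight to n_samples / (n_classes * count_of_that_class),
# so the rarer Health class gets the larger weight.
classes = np.array([0, 1])  # 0 = Health, 1 = Not_Health (ktrain's class order)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train_ids)
class_weight = {int(i): float(w) for i, w in zip(classes, cw)}
# With roughly 20k Health vs 140k Not_Health training rows this comes out to
# about {0: 4.09, 1: 0.57}, matching the weights printed during training.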

Training Strategy¶

The one-cycle learning rate policy (for fast convergence) with a maximum learning rate of 2e-5 for 3 epochs was used, a standard setup for fine-tuning BERT on medium-sized text classification tasks. The batch size (the number of samples propagated through the network per step) was raised to 96, and early stopping was not used since training runs for only 3 epochs.
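In ktrain, that training setup is roughly the following (a sketch; the actual training cell appears later in the notebook):

import ktrain

# Wrap the model and tokenized data in a ktrain learner; batch size 96.
learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                             val_data=(x_val, y_val), batch_size=96)

# One-cycle policy: the learning rate ramps up to a peak of 2e-5 and back down
# over 3 epochs, with the class weights from above rescaling the loss.
learner.fit_onecycle(2e-5, 3, class_weight=class_weight)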

Evaluation¶

The dataset was split 80/20 into training and validation sets. Model performance was evaluated on the validation set using ktrain's built-in validate method, which reports precision, recall, F1-score, and support for each class. This provides insight into overall accuracy and into the trade-off between catching more health articles (recall) and minimizing false positives (precision). A confusion matrix was included to better track false negatives (missed opportunities) and false positives (wasted ad spend).
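A minimal sketch of this evaluation step (mirroring the call in the training cell below):

# learner, x_val, y_val, and preproc are created in the training cell below.
cm = learner.validate(val_data=(x_val, y_val), class_names=preproc.get_classes())
print(cm)  # confusion matrix: rows = true class, columns = predicted class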

Threshold Tuning: Waste Less Ad-Dollars Vs Missed Opportunities¶

Tuning the probability threshold can save ad dollars by placing ads only on articles the model is confident are Health, but it reduces the total number of real Health articles reached. The threshold can also be tuned to catch nearly all the Health articles, at the cost of more wasted ads.

The default probability threshold is p(health) >= 0.5, so an article with p(health) = 0.51 is classified as "Health".

This cutoff can be raised to require more confident "Health" classifications. This option can be useful for Theragun because ads will only be shown on high-confidence health pages, and maximizing precision saves more ad dollars.

The threshold can also be lowered to increase recall, maximizing the number of Health articles we reach, at the expense of more misplaced ads.
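As a sketch of the idea (not the exact evaluation code, which appears in the cells below and additionally rejects low-confidence articles for both classes), thresholding the Health probability looks like this:

import numpy as np

def classify_with_threshold(p_health, threshold=0.5):
    """Label an article 'Health' only when p(health) clears the threshold."""
    return np.where(p_health >= threshold, "Health", "Not_Health")

# Raising the threshold (e.g. 0.97) trades recall for precision: fewer but more
# confident Health placements. Lowering it (e.g. 0.05) does the opposite.
print(classify_with_threshold(np.array([0.51, 0.96, 0.03]), threshold=0.50))  # ['Health' 'Health' 'Not_Health']
print(classify_with_threshold(np.array([0.51, 0.96, 0.03]), threshold=0.97))  # ['Not_Health' 'Not_Health' 'Not_Health']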

Results¶

After training the supervised BERT-based classifier on the HuffPost headlines and short descriptions, performance was evaluated on a held-out validation set (20% of ~200k total entries). The task was binary: Health (articles tagged WELLNESS or HEALTHY LIVING) versus Not_Health (all other labeled categories).

Threshold¶

The validation set has about 4,900 Health and about 35,100 Not_Health entries.

The higher the threshold, the less coverage of the data we get (actual health articles are rejected due to low confidence). At a threshold of 0.97, precision reached about 0.89 while keeping about 88.4% coverage of the data.

While about 11.6% of the total data was rejected by the threshold, this reduced the Health class by about 26% (rejecting ~26% of Health articles due to uncertainty), while keeping false positives low to maximize precision and minimize wasted ad dollars.

To instead maximize the number of Health articles caught, without regard to wasted ads, the threshold can be lowered. Lowering the threshold to 0.05 misses only 224 Health articles out of ~4,900, but we would be sending out 3,365 bad ads to get 4,690 good ones. Compared to the base threshold, which sends out 1,708 bad ads to get 4,443 good ones, I do not think lowering the threshold is worth it.

Raising the threshold to 0.97 means we are sending out 426 bad ads to get 3,433 good ones. More information on ad profitability is required to determine the best threshold.
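As an illustration of how that decision could be made, the confusion-matrix counts reported below can be plugged into a simple expected-profit comparison; the per-ad value and cost here are hypothetical placeholders, not figures from the campaign:

# Hypothetical economics: value earned per correctly placed ad and cost of a
# wasted placement. Replace these with Theragun's real numbers.
value_per_good_ad = 0.10
cost_per_bad_ad = 0.05

# (true positives, false positives) taken from the confusion matrices reported below.
results = {0.05: (4690, 3365), 0.50: (4443, 1708), 0.97: (3433, 426)}
for thr, (tp, fp) in results.items():
    profit = tp * value_per_good_ad - fp * cost_per_bad_ad
    print(f"threshold {thr:.2f}: expected profit = {profit:.2f}")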

Classification Report & Confusion Matrix¶

Precision measures how many of the pages flagged as “Health” were truly Health. A precision of 0.8896 means that 88.96% of the ad placements would be on wellness-related pages (low wasted spend).

Recall measures how many of the actual Health pages were successfully caught. A recall of 0.9481 means detection of 94.81% of the available Health content (few missed opportunities). Articles rejected by a threshold (e.g., at 0.97) are not factored into this recall calculation.

F1-score is the harmonic mean of precision and recall, balancing the two into a single number.

Support shows how many validation samples belonged to each class, which helps interpret the scores.

Macro Avg averages the per-class scores equally, even when one class is rarer.

Weighted Avg shows the average across all classes weighted by their support (number of samples).
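These quantities all follow from the confusion matrix; for example, the 0.97-threshold counts reported below reproduce the Health-class scores directly:

# Health-class metrics from the 0.97-threshold confusion matrix below:
# TP = 3433 (true Health, predicted Health), FP = 426, FN = 188.
tp, fp, fn = 3433, 426, 188
precision = tp / (tp + fp)                           # ~0.8896
recall = tp / (tp + fn)                              # ~0.9481
f1 = 2 * precision * recall / (precision + recall)   # ~0.9179
print(round(precision, 4), round(recall, 4), round(f1, 4))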

--- Threshold: 0.05 ---

Class            Precision   Recall   F1-Score   Support
Health           0.58        0.95     0.72       4914
Not_Health       0.99        0.90     0.95       35159
Accuracy                              0.91       40073
Macro Avg        0.79        0.93     0.83       40073
Weighted Avg     0.94        0.91     0.92       40073

                   Pred Health   Pred Not_Health
True Health        4690          224
True Not_Health    3365          31794

[4690, 224] -> Health articles correctly identified vs misclassified

[3365, 31794] -> Not_Health articles misclassified vs correctly identified

True positives (4690): Health articles correctly flagged

False negatives (224): Health articles mistakenly classified as Not_Health (lost opportunity)

False positives (3365): Not_Health articles predicted as Health (wasted ads)

True negatives (31794): Not_Health articles correctly predicted

--- Threshold: 0.50 ---

Class            Precision   Recall   F1-Score   Support
Health           0.72        0.90     0.80       4914
Not_Health       0.99        0.95     0.97       35159
Accuracy                              0.95       40073
Macro Avg        0.85        0.93     0.89       40073
Weighted Avg     0.95        0.95     0.95       40073

                   Pred Health   Pred Not_Health
True Health        4443          471
True Not_Health    1708          33451

[4443, 471] -> Health articles correctly identified vs misclassified

[1708, 33451] -> Not_Health articles misclassified vs correctly identified

True positives (4443): Health articles correctly flagged

False negatives (471): Health articles mistakenly classified as Not_Health (lost opportunity)

False positives (1708): Not_Health articles predicted as Health (wasted ads)

True negatives (33451): Not_Health articles correctly predicted

--- Threshold: 0.97 ---

Class            Precision   Recall   F1-Score   Support
Health           0.8896      0.9481   0.9179     3621
Not_Health       0.9940      0.9866   0.9903     31801
Accuracy                              0.9827     35422
Macro Avg        0.9418      0.9673   0.9541     35422
Weighted Avg     0.9834      0.9827   0.9829     35422

                   Pred Health   Pred Not_Health
True Health        3433          188
True Not_Health    426           31375

[3433, 188] -> Health articles correctly identified vs misclassified

[426, 31375] -> Not_Health articles misclassified vs correctly identified

True positives (3433): Health articles correctly flagged

False negatives (188): Health articles mistakenly classified as Not_Health (lost opportunity)

False positives (426): Not_Health articles predicted as Health (wasted ads)

True negatives (31375): Not_Health articles correctly predicted

Takeaway¶

At the 0.97 threshold, the model prioritizes precision (minimizing wasted ads) while maintaining strong recall (capturing the majority of wellness content), but it rejects about 26% of the Health articles in our validation data due to low confidence. This trade-off may be well aligned with Theragun's campaign goals: ensuring their ads appear in high-quality health contexts without overspending on irrelevant inventory.

At the 0.50 threshold, we identify roughly 1,000 more true Health articles (about a 30% increase) at the cost of more false positives: there are 4x as many false positives, roughly 1,300 more wasted ads than at the 0.97 threshold, which could be costly.

At the 0.05 threshold, only about 250 additional Health articles are identified compared to the 0.50 threshold, at the cost of roughly 1,650 additional wasted ads (3,365 in total). More information on the profitability of the ads is required to know which approach is best.

Future Research & Improvements¶

Future research could look into the low-confidence articles specifically, get to the root cause of why the model is not confident about them, and adjust parameters from there.

If there is a reasonable probability that misclassified ads still convert into sales, then misclassifying a proportion of ads may be more tolerable. After running the campaign, the misclassified placements could be analyzed to check whether any converted; from there, the company could expand the target domain to include other categories.

Possible improvements to increase recall and maximize the number of good health ads sent: increase the class weight on Health, undersample Not_Health or oversample Health, increase the maximum sequence length to 128 tokens, try a different model backbone (RoBERTa-base or DistilBERT), or supplement the current model with a simpler model (e.g., TF-IDF features with logistic regression or an SVM) that flags extra health candidates, taking the union of the positives; a sketch of this last idea follows.
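This supplementary model is an assumption, not part of the trained pipeline: a TF-IDF + logistic regression classifier flags extra Health candidates whose positives are unioned with BERT's. Here df, probs, and classes are the objects built in the cells below, while val_texts stands in for the raw validation strings.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Cheap auxiliary model: TF-IDF unigrams/bigrams + class-weighted logistic regression.
# Shown fitting on the full df for brevity; in practice fit on the training split
# only, to avoid leaking validation rows.
aux = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=200_000),
    LogisticRegression(max_iter=1000, class_weight="balanced"),
)
aux.fit(df["text"], df["label_str"])

# Union of positives: place an ad if either model predicts "Health".
aux_pred = aux.predict(val_texts)                # val_texts: raw validation strings (placeholder)
bert_pred = np.array(classes)[probs.argmax(1)]   # probs, classes: from the evaluation cells below
union_health = (aux_pred == "Health") | (bert_pred == "Health")
print(union_health.sum(), "articles flagged as Health by at least one model")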

In [191]:
import json
import pandas as pd

path = "news_category_trainingdata.json"

with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Each top-level key is a column, each value is a dict of {row_index: value}
# Rebuild rows by zipping values across columns
df = pd.DataFrame({col: pd.Series(vals) for col, vals in data.items()})

# Reset index and drop NAs
df = df.reset_index(drop=True)
df = df.dropna(subset=["category", "headline"])

print(df.shape)
print(df.head())
print(df["category"].value_counts().head())
(200853, 6)
        category                                           headline  \
0          CRIME  There Were 2 Mass Shootings In Texas Last Week...   
1  ENTERTAINMENT  Will Smith Joins Diplo And Nicky Jam For The 2...   
2  ENTERTAINMENT    Hugh Grant Marries For The First Time At Age 57   
3  ENTERTAINMENT  Jim Carrey Blasts 'Castrato' Adam Schiff And D...   
4  ENTERTAINMENT  Julianna Margulies Uses Donald Trump Poop Bags...   

           authors                                               link  \
0  Melissa Jeltsen  https://www.huffingtonpost.com/entry/texas-ama...   
1    Andy McDonald  https://www.huffingtonpost.com/entry/will-smit...   
2       Ron Dicker  https://www.huffingtonpost.com/entry/hugh-gran...   
3       Ron Dicker  https://www.huffingtonpost.com/entry/jim-carre...   
4       Ron Dicker  https://www.huffingtonpost.com/entry/julianna-...   

                                   short_description        date  
0  She left her husband. He killed their children...  2018-05-26  
1                           Of course it has a song.  2018-05-26  
2  The actor and his longtime girlfriend Anna Ebe...  2018-05-26  
3  The actor gives Dems an ass-kicking for not fi...  2018-05-26  
4  The "Dietland" actress said using the bags is ...  2018-05-26  
category
POLITICS          32739
WELLNESS          17827
ENTERTAINMENT     16058
TRAVEL             9887
STYLE & BEAUTY     9649
Name: count, dtype: int64
In [36]:
# Add Labels for Health/Wellness
health_cats = {"WELLNESS", "HEALTHY LIVING"}
df["label_str"] = df["category"].apply(lambda x: "Health" if x in health_cats else "Not_Health")
print(df["label_str"].value_counts())
label_str
Not_Health    176332
Health         24521
Name: count, dtype: int64
In [60]:
import ktrain
from ktrain import text
import warnings
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
warnings.filterwarnings("ignore")

#Build a stronger input field: headline + short_description
df["short_description"] = df["short_description"].fillna("")
df["text"] = (df["headline"].astype(str).str.strip()
              + " [SEP] "
              + df["short_description"].astype(str).str.strip())
# Clean/dedup
df = df.dropna(subset=["text", "label_str"])
df = df[df["text"].str.len() > 0].drop_duplicates(subset=["text"]).reset_index(drop=True)

# Class names must match labels
class_names = ["Health", "Not_Health"]

# Split into train/val and preprocess with BERT (96 tokens)
(x_train, y_train), (x_val, y_val), preproc = text.texts_from_array(
    x_train = df["text"].values,
    y_train = df["label_str"].values,
    class_names = class_names,
    val_pct = 0.2,
    maxlen = 96,
    preprocess_mode = "bert",
    random_state = 42
)

# Class-weighted loss to counter imbalance 
classes = np.arange(len(preproc.get_classes()))     # e.g., [0,1]
y_train_ids = y_train.argmax(1) if getattr(y_train, "ndim", 1) == 2 else y_train
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train_ids)
class_weight = {int(i): float(w) for i, w in zip(classes, cw)}
print("Class weights:", class_weight)

# Build classifier
model = text.text_classifier("bert", (x_train, y_train), preproc=preproc)

# Wrap in learner
learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_val, y_val), batch_size=96)

# Train (3 epochs is usually enough for BERT on this dataset; no early stopping needed with only 3 epochs)
learner.fit_onecycle(2e-5, 3, class_weight=class_weight)

# Evaluate
actual_classes = preproc.get_classes()
print("ktrain class order:", actual_classes)
learner.validate(val_data=(x_val, y_val), class_names=actual_classes)
preprocessing train...
language: en
done.
Is Multi-Label? False
preprocessing test...
language: en
done.
task: text classification
Class weights: {0: 4.089916309450908, 1: 0.5696395064536305}
Is Multi-Label? False
maxlen is 96
done.


begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
5010/5010 [==============================] - 31246s 6s/step - loss: 0.2216 - accuracy: 0.9074 - val_loss: 0.2146 - val_accuracy: 0.9109
Epoch 2/3
5010/5010 [==============================] - 28989s 6s/step - loss: 0.1566 - accuracy: 0.9315 - val_loss: 0.1772 - val_accuracy: 0.9300
Epoch 3/3
5010/5010 [==============================] - 28980s 6s/step - loss: 0.0803 - accuracy: 0.9644 - val_loss: 0.1639 - val_accuracy: 0.9456
ktrain class order: ['Health', 'Not_Health']
1253/1253 [==============================] - 2123s 2s/step
              precision    recall  f1-score   support

      Health       0.72      0.90      0.80      4914
  Not_Health       0.99      0.95      0.97     35159

    accuracy                           0.95     40073
   macro avg       0.85      0.93      0.89     40073
weighted avg       0.95      0.95      0.95     40073

Out[60]:
array([[ 4443,   471],
       [ 1708, 33451]], dtype=int64)
In [72]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Use the tensors returned by texts_from_array
x_inputs = x_val                             # tuple/list of arrays for transformers
y_true = y_val.argmax(1) if y_val.ndim == 2 else y_val

# Model probabilities -> class ids
probs  = learner.model.predict(x_inputs, verbose=0)
y_pred = probs.argmax(1)

# Class names from the preprocessor (source of truth)
names = preproc.get_classes()
print(classification_report(y_true, y_pred, target_names=names, digits=4))
print(confusion_matrix(y_true, y_pred))
              precision    recall  f1-score   support

      Health     0.7223    0.9042    0.8031      4914
  Not_Health     0.9861    0.9514    0.9685     35159

    accuracy                         0.9456     40073
   macro avg     0.8542    0.9278    0.8858     40073
weighted avg     0.9538    0.9456    0.9482     40073

[[ 4443   471]
 [ 1708 33451]]
In [73]:
# setting a threshold of 0.75 for both classes, rejecting the rest, rechecking validation
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.75):
    """
    Evaluate with a reject option using precomputed class probabilities.
    probs: np.ndarray, shape (N, 2)  -> model probabilities per class
    y_val: one-hot or int labels
    classes: list like ['Health', 'Not_Health']
    """
    # true labels as ints
    y_true = y_val.argmax(1) if getattr(y_val, "ndim", 1) == 2 else y_val

    # class indices
    h_idx = classes.index('Health')
    nh_idx = classes.index('Not_Health')

    p_health = probs[:, h_idx]
    p_not    = probs[:, nh_idx]

    # apply thresholds with reject (to not reduce recall)
    y_pred = np.full_like(y_true, fill_value=-1)
    y_pred[p_health >= threshold] = h_idx
    y_pred[(p_not >= threshold) & (y_pred == -1)] = nh_idx

    # keep confident only
    mask = (y_pred != -1)
    kept_true, kept_pred = y_true[mask], y_pred[mask]

    print(f"Threshold = {threshold}")
    print(f"Kept {mask.sum()}/{len(y_pred)} samples ({100*mask.mean():.1f}% coverage)\n")

    if mask.sum() == 0:
        print("No samples met the threshold.")
        return

    print("Classification Report (confident samples only):")
    print(classification_report(kept_true, kept_pred, target_names=classes, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(kept_true, kept_pred))

# get probs from the model (since x_val is tokenized tensors) 
probs = learner.model.predict(x_val, verbose=0)   # shape (N, 2)
classes = preproc.get_classes()                   # ['Health','Not_Health']

# run evaluation with reject threshold=0.75
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.75)
Threshold = 0.75
Kept 38738/40073 samples (96.7% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.7762    0.9253    0.8442      4603
  Not_Health     0.9897    0.9640    0.9767     34135

    accuracy                         0.9594     38738
   macro avg     0.8829    0.9446    0.9104     38738
weighted avg     0.9643    0.9594    0.9609     38738

Confusion Matrix:
[[ 4259   344]
 [ 1228 32907]]
In [207]:
print("                       run evaluation with reject threshold=0.80")
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.80)
print("                       run evaluation with reject threshold=0.85")
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.85)
print("                       run evaluation with reject threshold=0.90")
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.90)
print("                       run evaluation with reject threshold=0.95")
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.95)

print("                       run evaluation with reject threshold 0.97")
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.97)
print("                       run evaluation with reject threshold 0.98")
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.98)
print("                       run evaluation with reject threshold 0.99")
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.99)
                       run evaluation with reject threshold=0.80
Threshold = 0.8
Kept 38374/40073 samples (95.8% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.7909    0.9297    0.8547      4509
  Not_Health     0.9904    0.9673    0.9787     33865

    accuracy                         0.9629     38374
   macro avg     0.8907    0.9485    0.9167     38374
weighted avg     0.9670    0.9629    0.9641     38374

Confusion Matrix:
[[ 4192   317]
 [ 1108 32757]]
                       run evaluation with reject threshold=0.85
Threshold = 0.85
Kept 37948/40073 samples (94.7% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.8074    0.9339    0.8661      4400
  Not_Health     0.9911    0.9708    0.9809     33548

    accuracy                         0.9665     37948
   macro avg     0.8993    0.9523    0.9235     37948
weighted avg     0.9698    0.9665    0.9675     37948

Confusion Matrix:
[[ 4109   291]
 [  980 32568]]
                       run evaluation with reject threshold=0.90
Threshold = 0.9
Kept 37342/40073 samples (93.2% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.8289    0.9387    0.8804      4228
  Not_Health     0.9920    0.9753    0.9836     33114

    accuracy                         0.9711     37342
   macro avg     0.9105    0.9570    0.9320     37342
weighted avg     0.9736    0.9711    0.9719     37342

Confusion Matrix:
[[ 3969   259]
 [  819 32295]]
                       run evaluation with reject threshold=0.95
Threshold = 0.95
Kept 36292/40073 samples (90.6% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.8641    0.9428    0.9017      3917
  Not_Health     0.9930    0.9821    0.9875     32375

    accuracy                         0.9778     36292
   macro avg     0.9285    0.9624    0.9446     36292
weighted avg     0.9791    0.9778    0.9782     36292

Confusion Matrix:
[[ 3693   224]
 [  581 31794]]
                       run evaluation with reject threshold 0.97
Threshold = 0.97
Kept 35422/40073 samples (88.4% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.8896    0.9481    0.9179      3621
  Not_Health     0.9940    0.9866    0.9903     31801

    accuracy                         0.9827     35422
   macro avg     0.9418    0.9673    0.9541     35422
weighted avg     0.9834    0.9827    0.9829     35422

Confusion Matrix:
[[ 3433   188]
 [  426 31375]]
                       run evaluation with reject threshold 0.98
Threshold = 0.98
Kept 34607/40073 samples (86.4% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.9096    0.9507    0.9297      3324
  Not_Health     0.9947    0.9900    0.9923     31283

    accuracy                         0.9862     34607
   macro avg     0.9522    0.9703    0.9610     34607
weighted avg     0.9866    0.9862    0.9863     34607

Confusion Matrix:
[[ 3160   164]
 [  314 30969]]
                       run evaluation with reject threshold 0.99
Threshold = 0.99
Kept 33082/40073 samples (82.6% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.9435    0.9487    0.9461      2709
  Not_Health     0.9954    0.9949    0.9952     30373

    accuracy                         0.9911     33082
   macro avg     0.9694    0.9718    0.9706     33082
weighted avg     0.9912    0.9911    0.9912     33082

Confusion Matrix:
[[ 2570   139]
 [  154 30219]]
In [271]:
print("                       run evaluation with reject threshold 0.05")
evaluate_with_reject_from_probs(probs, y_val, classes, threshold=0.05)
                       run evaluation with reject threshold 0.05
Threshold = 0.05
Kept 40073/40073 samples (100.0% coverage)

Classification Report (confident samples only):
              precision    recall  f1-score   support

      Health     0.5822    0.9544    0.7233      4914
  Not_Health     0.9930    0.9043    0.9466     35159

    accuracy                         0.9104     40073
   macro avg     0.7876    0.9294    0.8349     40073
weighted avg     0.9426    0.9104    0.9192     40073

Confusion Matrix:
[[ 4690   224]
 [ 3365 31794]]
In [294]:
first_description = df['short_description'].iloc[2]
first_headline = df['headline'].iloc[2]
print(first_headline)
print(first_description)
Hugh Grant Marries For The First Time At Age 57
The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.