import warnings
warnings.filterwarnings('ignore')

import re
import string
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import skew

from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD

sns.set_theme(style='whitegrid', context='notebook')
pd.set_option('display.max_colwidth', 120)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


# Data load with safety checks for robust execution
data_path = Path('../data/flipitnews-data.csv')
if not data_path.exists():
    raise FileNotFoundError(f'Dataset not found at: {data_path.resolve()}')

df = pd.read_csv(data_path)
print(f'Shape: {df.shape}')
df.head(3)

Shape: (2225, 2)


schema_df = pd.DataFrame({
    'column': df.columns,
    'dtype': [str(dt) for dt in df.dtypes.values],
    'missing_count': [df[col].isna().sum() for col in df.columns],
    'missing_pct': [df[col].isna().mean() * 100 for col in df.columns],
    'n_unique': [df[col].nunique(dropna=True) for col in df.columns]
})

dup_total = df.duplicated().sum()
dup_article = df.duplicated(subset=['Article']).sum()

print(f'Total rows: {len(df):,}')
print(f'Exact duplicate rows: {dup_total:,}')
print(f'Duplicate articles (by text): {dup_article:,}')
schema_df

Total rows: 2,225
Exact duplicate rows: 99
Duplicate articles (by text): 99


# Univariate: category distribution
cat_counts = df['Category'].value_counts().sort_values(ascending=False)
display(cat_counts.to_frame('count'))

plt.figure(figsize=(8, 4))
sns.barplot(x=cat_counts.index, y=cat_counts.values, palette='viridis')
plt.title('Category Distribution')
plt.xlabel('Category')
plt.ylabel('Article Count')
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()


# Text-derived numeric features for deeper EDA
eda_df = df.copy()
eda_df['article_char_count'] = eda_df['Article'].fillna('').astype(str).str.len()
eda_df['article_word_count'] = eda_df['Article'].fillna('').astype(str).str.split().str.len()
eda_df['article_sentence_count'] = eda_df['Article'].fillna('').astype(str).str.count(r'[.!?]+') + 1
eda_df['article_avg_word_len'] = eda_df['article_char_count'] / eda_df['article_word_count'].replace(0, np.nan)
eda_df['article_unique_ratio'] = eda_df['Article'].fillna('').astype(str).apply(
    lambda x: (len(set(x.lower().split())) / max(len(x.split()), 1))
)

num_cols = ['article_char_count', 'article_word_count', 'article_sentence_count', 'article_avg_word_len', 'article_unique_ratio']
display(eda_df[num_cols].describe().T)

skew_df = pd.DataFrame({
    'feature': num_cols,
    'skewness': [skew(eda_df[col].dropna()) for col in num_cols]
}).sort_values('skewness', key=lambda s: s.abs(), ascending=False)
display(skew_df)

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
sns.histplot(eda_df['article_word_count'], kde=True, ax=axes[0, 0], color='#2A9D8F')
axes[0, 0].set_title('Word Count Distribution')

sns.histplot(eda_df['article_char_count'], kde=True, ax=axes[0, 1], color='#264653')
axes[0, 1].set_title('Character Count Distribution')

sns.boxplot(x=eda_df['article_word_count'], ax=axes[1, 0], color='#E9C46A')
axes[1, 0].set_title('Word Count Outlier View')

sns.boxplot(x=eda_df['article_char_count'], ax=axes[1, 1], color='#F4A261')
axes[1, 1].set_title('Character Count Outlier View')

plt.tight_layout()
plt.show()

# IQR-based outlier counts for operational risk visibility
def iqr_outlier_count(series):
    q1, q3 = np.percentile(series.dropna(), [25, 75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return int(((series < lower) | (series > upper)).sum()), lower, upper

for c in ['article_word_count', 'article_char_count']:
    cnt, lo, hi = iqr_outlier_count(eda_df[c])
    print(f'{c}: outliers={cnt:,}, lower={lo:.2f}, upper={hi:.2f}')

article_word_count: outliers=66, lower=-93.50, upper=822.50
article_char_count: outliers=62, lower=-588.00, upper=4836.00


# Bivariate + multivariate analysis
cat_len_stats = (
    eda_df.groupby('Category')['article_word_count']
    .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
    .sort_values('mean', ascending=False)
)
display(cat_len_stats)

plt.figure(figsize=(10, 4))
sns.boxplot(data=eda_df, x='Category', y='article_word_count', palette='Set2')
plt.title('Bivariate: Word Count by Category')
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()

corr = eda_df[num_cols].corr(numeric_only=True)
plt.figure(figsize=(7, 5))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='YlGnBu')
plt.title('Multivariate Correlation: Text-Derived Features')
plt.tight_layout()
plt.show()


# Data quality treatments
model_df = df.copy()
model_df = model_df.dropna(subset=['Article', 'Category']).copy()
model_df = model_df.drop_duplicates(subset=['Article', 'Category']).copy()
model_df['Article'] = model_df['Article'].astype(str)
model_df['Category'] = model_df['Category'].astype(str)

# Lightweight text normalization (kept intentionally simple and reproducible)
def normalize_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\b\w{1,2}\b', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

model_df['Article_clean'] = model_df['Article'].map(normalize_text)

print(f'Rows retained for modeling: {len(model_df):,}')
model_df[['Category', 'Article', 'Article_clean']].head(2)

Rows retained for modeling: 2,126


# Train/Validation/Test split (leakage-safe)
X = model_df['Article_clean']
y = model_df['Category']

# Required 75:25 split for questionnaire and holdout test
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)

# Validation split from training fold only
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.20, random_state=RANDOM_STATE, stratify=y_train_full
)

print('Split shapes (75:25 first split):')
print(f'X_train_full: {X_train_full.shape}, X_test: {X_test.shape}')
print('Final modeling split (train/val/test):')
print(f'X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}')

Split shapes (75:25 first split):
X_train_full: (1594,), X_test: (532,)
Final modeling split (train/val/test):
X_train: (1275,), X_val: (319,), X_test: (532,)


def expected_calibration_error(y_true, y_prob, n_bins=10):
    # Multiclass ECE based on max confidence vs correctness
    y_true = np.asarray(y_true)
    confidences = y_prob.max(axis=1)
    preds = y_prob.argmax(axis=1)
    y_encoded = pd.Categorical(y_true).codes
    correctness = (preds == y_encoded).astype(int)

    bins = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        mask = (confidences > bins[i]) & (confidences <= bins[i + 1])
        if mask.sum() == 0:
            continue
        bin_acc = correctness[mask].mean()
        bin_conf = confidences[mask].mean()
        ece += (mask.sum() / len(y_true)) * abs(bin_acc - bin_conf)
    return ece

def evaluate_model(name, estimator, X_tr, y_tr, X_ev, y_ev):
    estimator.fit(X_tr, y_tr)
    pred = estimator.predict(X_ev)

    result = {
        'model': name,
        'accuracy': accuracy_score(y_ev, pred),
        'precision_macro': precision_score(y_ev, pred, average='macro', zero_division=0),
        'recall_macro': recall_score(y_ev, pred, average='macro', zero_division=0),
        'f1_macro': f1_score(y_ev, pred, average='macro', zero_division=0)
    }

    lb = LabelBinarizer()
    y_ev_bin = lb.fit_transform(y_ev)

    scores = None
    if hasattr(estimator, 'predict_proba'):
        scores = estimator.predict_proba(X_ev)
    elif hasattr(estimator, 'decision_function'):
        scores = estimator.decision_function(X_ev)

    if scores is not None:
        try:
            result['roc_auc_ovr_macro'] = roc_auc_score(y_ev, scores, multi_class='ovr', average='macro')
        except Exception:
            result['roc_auc_ovr_macro'] = np.nan
        try:
            if scores.ndim == 1:
                result['pr_auc_macro'] = np.nan
            else:
                if y_ev_bin.shape[1] == 1:
                    result['pr_auc_macro'] = np.nan
                else:
                    result['pr_auc_macro'] = average_precision_score(y_ev_bin, scores, average='macro')
        except Exception:
            result['pr_auc_macro'] = np.nan
    else:
        result['roc_auc_ovr_macro'] = np.nan
        result['pr_auc_macro'] = np.nan

    if hasattr(estimator, 'predict_proba'):
        try:
            prob = estimator.predict_proba(X_ev)
            result['ece'] = expected_calibration_error(y_ev, prob)
        except Exception:
            result['ece'] = np.nan
    else:
        result['ece'] = np.nan

    return result, estimator


# Baseline model set
baseline_models = {
    'Dummy_MostFrequent': Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)),
        ('clf', DummyClassifier(strategy='most_frequent', random_state=RANDOM_STATE))
    ]),
    'NaiveBayes_BOW': Pipeline([
        ('bow', CountVectorizer(stop_words='english', min_df=2, max_df=0.95)),
        ('clf', MultinomialNB())
    ]),
    'KNN_TFIDF': Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=3, max_features=5000)),
        ('clf', KNeighborsClassifier(n_neighbors=7, metric='cosine', weights='distance'))
    ]),
    'RandomForest_TFIDF': Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=3, max_features=5000)),
        ('clf', RandomForestClassifier(n_estimators=250, random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced_subsample'))
    ]),
    'LogReg_TFIDF': Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df=0.95)),
        ('clf', LogisticRegression(max_iter=3000, random_state=RANDOM_STATE, n_jobs=-1))
    ]),
    'LinearSVC_TFIDF': Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df=0.95)),
        ('clf', LinearSVC(random_state=RANDOM_STATE))
    ])
}

baseline_results = []
fitted_baselines = {}

for name, model in baseline_models.items():
    res, fitted = evaluate_model(name, model, X_train, y_train, X_val, y_val)
    baseline_results.append(res)
    fitted_baselines[name] = fitted

baseline_df = pd.DataFrame(baseline_results).sort_values('f1_macro', ascending=False).reset_index(drop=True)
baseline_df


# Strategy A: Tuned Logistic Regression (word n-grams)
lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=4000, random_state=RANDOM_STATE, n_jobs=-1))
])

lr_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [2, 3],
    'tfidf__max_df': [0.9, 0.95],
    'clf__C': [0.5, 1.0, 2.0]
}

lr_gs = GridSearchCV(
    estimator=lr_pipe,
    param_grid=lr_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=0
)
lr_gs.fit(X_train, y_train)

print('Best LR params:', lr_gs.best_params_)
print('Best LR CV macro-F1:', round(lr_gs.best_score_, 4))

lr_val_res, tuned_lr = evaluate_model('Tuned_LogReg', lr_gs.best_estimator_, X_train, y_train, X_val, y_val)
pd.DataFrame([lr_val_res])

Best LR params: {'clf__C': 2.0, 'tfidf__max_df': 0.9, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Best LR CV macro-F1: 0.9718


# Strategy B: Tuned character n-gram Linear SVM with calibration
svm_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', analyzer='char_wb')),
    ('clf', LinearSVC(random_state=RANDOM_STATE))
])

svm_grid = {
    'tfidf__ngram_range': [(3, 5), (4, 6)],
    'tfidf__min_df': [2, 3],
    'clf__C': [0.5, 1.0, 2.0]
}

svm_gs = GridSearchCV(
    estimator=svm_pipe,
    param_grid=svm_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=0
)
svm_gs.fit(X_train, y_train)

# Calibrate for probability-based metrics
best_svm = svm_gs.best_estimator_
calibrated_svm = CalibratedClassifierCV(best_svm, method='sigmoid', cv=3)
svm_val_res, tuned_svm = evaluate_model('Tuned_CharSVM_Calibrated', calibrated_svm, X_train, y_train, X_val, y_val)

print('Best SVM params:', svm_gs.best_params_)
print('Best SVM CV macro-F1:', round(svm_gs.best_score_, 4))
pd.DataFrame([svm_val_res])

Best SVM params: {'clf__C': 0.5, 'tfidf__min_df': 3, 'tfidf__ngram_range': (4, 6)}
Best SVM CV macro-F1: 0.9717


# Compare baseline winner + tuned alternatives on validation
val_compare_df = baseline_df.copy()
val_compare_df['stage'] = 'baseline'

tuned_df = pd.DataFrame([lr_val_res, svm_val_res])
tuned_df['stage'] = 'tuned'

all_val_df = pd.concat([val_compare_df, tuned_df], ignore_index=True)
all_val_df = all_val_df.sort_values('f1_macro', ascending=False).reset_index(drop=True)
display(all_val_df)

best_model_name = all_val_df.loc[0, 'model']
if best_model_name == 'Tuned_LogReg':
    best_model = tuned_lr
elif best_model_name == 'Tuned_CharSVM_Calibrated':
    best_model = tuned_svm
else:
    best_model = fitted_baselines[best_model_name]

print(f'Selected model for final training: {best_model_name}')

Selected model for final training: Tuned_CharSVM_Calibrated


# Stability check: repeated stratified CV on train_full
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_STATE)
cv_scores = cross_val_score(best_model, X_train_full, y_train_full, cv=cv, scoring='f1_macro', n_jobs=-1)

stability_df = pd.DataFrame({
    'cv_macro_f1_mean': [cv_scores.mean()],
    'cv_macro_f1_std': [cv_scores.std()],
    'cv_macro_f1_min': [cv_scores.min()],
    'cv_macro_f1_max': [cv_scores.max()]
})
stability_df


# Refit selected model on train+validation, then evaluate once on holdout test
best_model.fit(X_train_full, y_train_full)
test_pred = best_model.predict(X_test)

test_metrics = {
    'accuracy': accuracy_score(y_test, test_pred),
    'precision_macro': precision_score(y_test, test_pred, average='macro', zero_division=0),
    'recall_macro': recall_score(y_test, test_pred, average='macro', zero_division=0),
    'f1_macro': f1_score(y_test, test_pred, average='macro', zero_division=0)
}

if hasattr(best_model, 'predict_proba'):
    test_scores = best_model.predict_proba(X_test)
    test_metrics['roc_auc_ovr_macro'] = roc_auc_score(y_test, test_scores, multi_class='ovr', average='macro')
    y_test_bin = LabelBinarizer().fit_transform(y_test)
    test_metrics['pr_auc_macro'] = average_precision_score(y_test_bin, test_scores, average='macro') if y_test_bin.shape[1] > 1 else np.nan
    test_metrics['ece'] = expected_calibration_error(y_test, test_scores)
else:
    test_metrics['roc_auc_ovr_macro'] = np.nan
    test_metrics['pr_auc_macro'] = np.nan
    test_metrics['ece'] = np.nan

test_metrics_df = pd.DataFrame([test_metrics])
display(test_metrics_df)

print('Classification Report (Test):')
print(classification_report(y_test, test_pred, digits=4))

fig, ax = plt.subplots(figsize=(7, 6))
ConfusionMatrixDisplay.from_predictions(y_test, test_pred, cmap='Blues', ax=ax, xticks_rotation=25)
ax.set_title('Confusion Matrix - Holdout Test')
plt.tight_layout()
plt.show()

Classification Report (Test):
               precision    recall  f1-score   support

     Business     0.9760    0.9683    0.9721       126
Entertainment     0.9674    0.9674    0.9674        92
     Politics     0.9706    0.9802    0.9754       101
       Sports     0.9921    1.0000    0.9960       126
   Technology     0.9767    0.9655    0.9711        87

     accuracy                         0.9774       532
    macro avg     0.9766    0.9763    0.9764       532
 weighted avg     0.9774    0.9774    0.9774       532


def top_terms_from_linear_pipeline(pipeline, top_n=15):
    # Supports Pipeline(vectorizer -> linear classifier) and calibrated wrappers
    est = pipeline
    if hasattr(pipeline, 'base_estimator'):
        est = pipeline.base_estimator

    if not hasattr(est, 'named_steps'):
        return None

    vec = est.named_steps.get('tfidf') or est.named_steps.get('bow')
    clf = est.named_steps.get('clf')

    if vec is None or clf is None or not hasattr(clf, 'coef_'):
        return None

    terms = np.array(vec.get_feature_names_out())
    classes = clf.classes_
    coef = clf.coef_

    records = []
    for i, cls in enumerate(classes):
        top_idx = np.argsort(coef[i])[-top_n:][::-1]
        top_terms = ', '.join(terms[top_idx])
        records.append({'class': cls, 'top_predictive_terms': top_terms})

    return pd.DataFrame(records)

term_df = top_terms_from_linear_pipeline(best_model, top_n=12)
if term_df is None:
    print('Top-term interpretation unavailable for this model family. Use permutation importance as fallback.')
else:
    display(term_df)

Top-term interpretation unavailable for this model family. Use permutation importance as fallback.


# Error analysis for stakeholder-friendly operational insights
error_df = pd.DataFrame({
    'text': X_test.values,
    'actual': y_test.values,
    'predicted': test_pred
})
error_df['is_error'] = error_df['actual'] != error_df['predicted']

print(f'Misclassified samples: {error_df.is_error.sum():,} / {len(error_df):,}')
display(error_df[error_df['is_error']].head(10))

Misclassified samples: 12 / 532


summary_rows = [
    {'Area': 'Data Volume', 'Finding': f"{len(df):,} labeled articles across {df['Category'].nunique()} categories", 'Business Impact': 'Sufficient for strong classical NLP baseline'},
    {'Area': 'Class Mix', 'Finding': df['Category'].value_counts().to_dict(), 'Business Impact': 'Moderately balanced labels reduce fairness risk'},
    {'Area': 'Best Validation Model', 'Finding': best_model_name, 'Business Impact': 'Model selected via macro-F1 and leakage-safe validation'},
    {'Area': 'Holdout Test Macro-F1', 'Finding': round(test_metrics['f1_macro'], 4), 'Business Impact': 'Expected generalization quality on unseen data'},
    {'Area': 'Stability (CV std)', 'Finding': round(float(stability_df['cv_macro_f1_std'].iloc[0]), 4), 'Business Impact': 'Lower variance indicates reliable deployment behavior'},
    {'Area': 'Calibration (ECE)', 'Finding': round(float(test_metrics['ece']), 4) if pd.notna(test_metrics['ece']) else 'NA', 'Business Impact': 'Probability reliability for threshold-based workflows'}
]

final_summary_df = pd.DataFrame(summary_rows)
final_summary_df

	count	mean	std	min	25%	50%	75%	max
article_char_count	2225.0	2262.936180	1364.102530	501.000000	1446.000000	1965.000000	2802.000000	25483.000000
article_word_count	2225.0	390.295281	241.753128	90.000000	250.000000	337.000000	479.000000	4492.000000
article_sentence_count	2225.0	20.559551	12.464056	5.000000	14.000000	18.000000	25.000000	247.000000
article_avg_word_len	2225.0	5.812172	0.267699	5.009852	5.642241	5.815409	5.989041	6.991379
article_unique_ratio	2225.0	0.573595	0.066291	0.304202	0.529833	0.569288	0.616379	0.905172

	count	mean	median	std	min	max
Category
Technology	401	507.356608	450.0	242.378774	165	3007
Politics	417	461.194245	449.0	304.527074	90	4492
Entertainment	386	337.676166	267.0	268.532851	144	3570
Sports	511	336.322896	296.0	191.445419	116	1671
Business	510	334.186275	303.0	138.009377	142	912

	model	accuracy	precision_macro	recall_macro	f1_macro	roc_auc_ovr_macro	pr_auc_macro	ece
0	LinearSVC_TFIDF	0.978056	0.978265	0.977767	0.977960	NaN	0.992922	NaN
1	LogReg_TFIDF	0.974922	0.974556	0.973921	0.974217	0.998150	0.993811	0.334020
2	NaiveBayes_BOW	0.962382	0.961426	0.960590	0.960298	0.997056	0.987633	0.038255
3	RandomForest_TFIDF	0.952978	0.954600	0.952087	0.953205	0.996193	0.988322	0.259737
4	KNN_TFIDF	0.921630	0.918459	0.919626	0.918579	0.991939	0.975213	0.045568
5	Dummy_MostFrequent	0.238245	0.047649	0.200000	0.076962	0.500000	0.200000	0.761755

	model	accuracy	precision_macro	recall_macro	f1_macro	roc_auc_ovr_macro	pr_auc_macro	ece	stage
0	Tuned_CharSVM_Calibrated	0.984326	0.983950	0.983732	0.983690	0.997145	0.993899	0.052294	tuned
1	LinearSVC_TFIDF	0.978056	0.978265	0.977767	0.977960	NaN	0.992922	NaN	baseline
2	Tuned_LogReg	0.978056	0.977254	0.977254	0.977254	0.998234	0.994330	0.249079	tuned
3	LogReg_TFIDF	0.974922	0.974556	0.973921	0.974217	0.998150	0.993811	0.334020	baseline
4	NaiveBayes_BOW	0.962382	0.961426	0.960590	0.960298	0.997056	0.987633	0.038255	baseline
5	RandomForest_TFIDF	0.952978	0.954600	0.952087	0.953205	0.996193	0.988322	0.259737	baseline
6	KNN_TFIDF	0.921630	0.918459	0.919626	0.918579	0.991939	0.975213	0.045568	baseline
7	Dummy_MostFrequent	0.238245	0.047649	0.200000	0.076962	0.500000	0.200000	0.761755	baseline

Company A News NLP Categorization¶

Executive Summary (for Business Leaders)¶

1) Business Framing and Problem Definition¶

Objective¶

Why this matters¶

Success Criteria¶

Constraints¶

2) Data Audit (Schema, Types, Missingness, Duplicates)¶

Data Audit Insight¶

3) Robust EDA¶

Distribution Insight¶

Outlier and Skewness Insight¶

Bivariate/Multivariate Insight¶

4) Preprocessing and Feature Engineering (with Rationale)¶

Preprocessing Outcome¶

5) Baseline Models First¶

Baseline Commentary¶

6) Hyperparameter Tuning + Alternative Modeling Strategies¶

Tuning and Strategy Insight¶

Stability Insight¶

7) Final Model Evaluation on Holdout Test¶

Holdout Performance Insight¶

8) Model Interpretation (Explainability)¶

Interpretation Insight¶

9) Business Insight Snapshot (First Run)¶

Consolidated Output Summary¶

10) Final Findings Summary Table¶

11) Recommendations and Trade-Offs¶

12) Executive Summary for LinkedIn¶

13) Risks, Assumptions, and Monitoring Plan¶

Risks¶

Assumptions¶

Monitoring Plan¶

	Category	Article
0	Technology	tv future in the hands of viewers with home theatre systems plasma high-definition tvs and digital video recorders...
1	Business	worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseeing an $11bn (£5.8bn) ...
2	Sports	tigers wary of farrell gamble leicester say they will not be rushed into making a bid for andy farrell should the ...

	feature	skewness
2	article_sentence_count	7.106646
1	article_word_count	6.035650
0	article_char_count	5.640058
4	article_unique_ratio	0.171343
3	article_avg_word_len	0.010000

	Category	Article	Article_clean
0	Technology	tv future in the hands of viewers with home theatre systems plasma high-definition tvs and digital video recorders...	future the hands viewers with home theatre systems plasma high definition tvs and digital video recorders moving int...
1	Business	worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseeing an $11bn (£5.8bn) ...	worldcom boss left books alone former worldcom boss bernie ebbers who accused overseeing fraud never made accounting...

	text	actual	predicted	is_error
3	show unites angolan families angolan families who are attempting track each other down after being separated nearly ...	Entertainment	Politics	True
37	card fraudsters targeting web new safeguards credit and debit card payments shops has led fraudsters focus internet ...	Business	Technology	True
77	web radio takes spanish rap global spin the radio dial the and you are likely find plenty spanish language music but...	Technology	Entertainment	True
97	fraud clampdown urged member states are failing report fraud and irregularities funds consistent basis the public sp...	Politics	Business	True
149	holmes wins top moment sprinter kelly holmes olympic victory has been named the top television moment bbc poll holme...	Entertainment	Sports	True
164	christmas shoppers flock tills shops all over the reported strong sales the last saturday before christmas with some...	Business	Entertainment	True
167	argonaut founder rebuilds empire jez san the man behind the argonaut games group which went into administration week...	Technology	Business	True
259	shock long lost daughter labour stephen pound has told his shock and his joy learning fathered daughter when was out...	Politics	Entertainment	True
263	young top euro earnings league british children enjoy the highest average annual income europe more than double that...	Business	Politics	True
302	games maker fights for survival one britain largest independent game makers argonaut games has been put for sale the...	Technology	Business	True

	Area	Finding	Business Impact
0	Data Volume	2,225 labeled articles across 5 categories	Sufficient for strong classical NLP baseline
1	Class Mix	{'Sports': 511, 'Business': 510, 'Politics': 417, 'Technology': 401, 'Entertainment': 386}	Moderately balanced labels reduce fairness risk
2	Best Validation Model	Tuned_CharSVM_Calibrated	Model selected via macro-F1 and leakage-safe validation
3	Holdout Test Macro-F1	0.9764	Expected generalization quality on unseen data
4	Stability (CV std)	0.0075	Lower variance indicates reliable deployment behavior
5	Calibration (ECE)	0.0389	Probability reliability for threshold-based workflows

	column	dtype	missing_count	missing_pct	n_unique
0	Category	str	0	0.0	5
1	Article	str	0	0.0	2126