# Execution-environment switch: the 'google_colab' value enables the Drive mount below.
# NOTE(review): the name is misspelled ("environtment") but it is referenced again
# later in this file, so the spelling is kept unchanged here.
run_environtment = 'google_colab'

if run_environtment == 'google_colab':
  # Mount Google Drive at /content/drive so files stored there can be opened by path.
  from google.colab import drive
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


import os
import math
import json
import random
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix, roc_auc_score, average_precision_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Seed every random number generator used in this file: Python, NumPy, and
# PyTorch (CPU always, CUDA only when a GPU is visible).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Favor throughput for this training notebook; enable strict determinism only when required.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
# Guarded with hasattr so the cell still runs if the installed torch lacks this setter.
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("high")

# Pick the compute device once; later cells branch on device.type.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LOGICAL_CPUS = os.cpu_count() or 2  # os.cpu_count() may return None; fall back to 2

# Record the name and total memory (GiB, rounded to 2 decimals) of GPU 0 when present.
if device.type == "cuda":
    gpu_props = torch.cuda.get_device_properties(0)
    GPU_NAME = gpu_props.name
    GPU_MEMORY_GB = round(gpu_props.total_memory / (1024 ** 3), 2)
else:
    GPU_NAME = None
    GPU_MEMORY_GB = 0.0

# Print a one-line summary of the runtime for reproducibility notes.
print({
    "device": str(device),
    "torch_version": torch.__version__,
    "torch_cuda": torch.version.cuda,
    "logical_cpus": LOGICAL_CPUS,
    "gpu_name": GPU_NAME,
    "gpu_memory_gb": GPU_MEMORY_GB
})

# Plot styling shared by all figures below.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_context("talk")

# Data location: a relative folder by default, overridden with a Drive path on Colab.
DATA_DIR = os.path.join("..", "data")
if run_environtment == 'google_colab':
  DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/DS_Projects/data/NLP'
TRAIN_PATH = os.path.join(DATA_DIR, "wnut 16.txt.conll")
TEST_PATH = os.path.join(DATA_DIR, "wnut 16test.txt.conll")
# Model artifacts are written relative to the working directory in every environment.
# NOTE(review): on Colab this is not under the mounted Drive — confirm that is intended.
ARTIFACT_DIR = os.path.join("..", "models")
os.makedirs(ARTIFACT_DIR, exist_ok=True)

{'device': 'cpu', 'torch_version': '2.10.0+cpu', 'torch_cuda': None, 'logical_cpus': 2, 'gpu_name': None, 'gpu_memory_gb': 0.0}


def read_conll(path):
    """Parse a whitespace-delimited CoNLL file into parallel token/tag sequences.

    Each non-blank line is split on whitespace: the last field is the tag and
    all preceding fields, re-joined with single spaces, form the token. Lines
    with fewer than two fields are ignored. A blank line closes the current
    sentence; consecutive blank lines do not create empty sentences.

    Args:
        path: location of a UTF-8 encoded CoNLL file.

    Returns:
        A pair ``(sentences, tags)`` where ``sentences[k]`` is the token list
        of the k-th sentence and ``tags[k]`` the matching tag list.
    """
    all_tokens, all_tags = [], []
    pending_tokens, pending_tags = [], []

    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.split()

            if fields:
                # Malformed single-field lines carry no tag and are skipped.
                if len(fields) >= 2:
                    *token_parts, label = fields
                    pending_tokens.append(" ".join(token_parts))
                    pending_tags.append(label)
            elif pending_tokens:
                # Blank line: the sentence collected so far is complete.
                all_tokens.append(pending_tokens)
                all_tags.append(pending_tags)
                pending_tokens, pending_tags = [], []

    # The file may end without a trailing blank line.
    if pending_tokens:
        all_tokens.append(pending_tokens)
        all_tags.append(pending_tags)

    return all_tokens, all_tags

# Load both splits as parallel lists: one token list and one tag list per sentence.
train_sentences, train_tags = read_conll(TRAIN_PATH)
test_sentences, test_tags = read_conll(TEST_PATH)

def to_token_df(sentences, tags, split_name):
    """Flatten parallel token/tag sequences into a one-row-per-token DataFrame.

    Args:
        sentences: list of token lists.
        tags: list of tag lists, aligned with ``sentences``.
        split_name: value stored in the ``split`` column of every row.

    Returns:
        A DataFrame with columns ``split``, ``sentence_id`` (position of the
        sentence in ``sentences``), ``token_idx`` (position of the token in
        its sentence), ``token`` and ``tag``.
    """
    records = [
        {
            "split": split_name,
            "sentence_id": sent_no,
            "token_idx": pos,
            "token": word,
            "tag": label,
        }
        for sent_no, (words, labels) in enumerate(zip(sentences, tags))
        for pos, (word, label) in enumerate(zip(words, labels))
    ]
    return pd.DataFrame(records)

# One row per token for each split, plus a combined frame for joint inspection.
df_train = to_token_df(train_sentences, train_tags, "train")
df_test = to_token_df(test_sentences, test_tags, "test")
df_all = pd.concat([df_train, df_test], ignore_index=True)

# Corpus size at sentence ("tweet") level and at token level.
print("Train tweets:", len(train_sentences), "Test tweets:", len(test_sentences))
print("Train tokens:", len(df_train), "Test tokens:", len(df_test))
# Trailing expression: shown as the cell result when run in a notebook.
df_all.head()

Train tweets: 2394 Test tweets: 3850
Train tokens: 46469 Test tokens: 61908


# Per-column schema summary: dtype, missing-value count, and missing percentage.
schema_df = pd.DataFrame({
    "column": df_all.columns,
    "dtype": [str(df_all[c].dtype) for c in df_all.columns],
    "missing": [int(df_all[c].isna().sum()) for c in df_all.columns],
    "missing_pct": [float(df_all[c].isna().mean() * 100) for c in df_all.columns]
})

# Two duplicate checks: fully identical rows, and repeated (split, sentence_id, token_idx) keys.
duplicate_rows = int(df_all.duplicated().sum())
duplicate_token_within_sentence = int(df_all.duplicated(["split", "sentence_id", "token_idx"]).sum())

print("Schema and missingness:")
display(schema_df)
print("Duplicate full rows:", duplicate_rows)
print("Duplicate split-sentence-token_idx keys:", duplicate_token_within_sentence)

Schema and missingness:

Duplicate full rows: 0
Duplicate split-sentence-token_idx keys: 0


# Univariate EDA on the training split: tag frequencies and tweet-length distribution.

# Tag frequency table with each tag's share of all training tokens (in percent).
train_tag_counts = df_train["tag"].value_counts().rename_axis("tag").reset_index(name="count")
train_tag_counts["pct"] = 100 * train_tag_counts["count"] / train_tag_counts["count"].sum()

# Tweet length (in tokens) with outlier bounds at 1.5 * IQR outside the quartiles.
sent_len_train = pd.Series([len(x) for x in train_sentences], name="tweet_length")
q1, q3 = sent_len_train.quantile([0.25, 0.75])
iqr = q3 - q1
outlier_low = q1 - 1.5 * iqr
outlier_high = q3 + 1.5 * iqr
outlier_count = int(((sent_len_train < outlier_low) | (sent_len_train > outlier_high)).sum())

print("Sentence length skewness:", round(sent_len_train.skew(), 3))
print(f"IQR bounds: [{outlier_low:.2f}, {outlier_high:.2f}] | Outlier tweets: {outlier_count}")

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
# Passing `palette` without `hue` is deprecated in seaborn; mapping hue to the
# same variable as y with the legend disabled gives the same colouring without
# the FutureWarning.
sns.barplot(
    data=train_tag_counts.head(12),
    y="tag",
    x="count",
    hue="tag",
    legend=False,
    ax=axes[0],
    palette="viridis",
)
axes[0].set_title("Top Entity Tags by Frequency (Train)")

# Length histogram with a KDE overlay and a dashed marker at the mean.
sns.histplot(sent_len_train, bins=30, kde=True, ax=axes[1], color="teal")
axes[1].axvline(sent_len_train.mean(), color="red", linestyle="--", label="mean")
axes[1].set_title("Tweet Length Distribution")
axes[1].legend()
plt.tight_layout()
plt.show()

display(train_tag_counts.head(20))

Sentence length skewness: -0.11
IQR bounds: [-2.50, 41.50] | Outlier tweets: 0

/tmp/ipykernel_8004/1760184833.py:15: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=train_tag_counts.head(12), y="tag", x="count", ax=axes[0], palette="viridis")


# Bivariate: entity presence vs tweet length bins
# has_entity is 1 when at least one tag in the tweet differs from "O".
entity_presence = []
for s_tags, s_tokens in zip(train_tags, train_sentences):
    entity_presence.append({
        "has_entity": int(any(t != "O" for t in s_tags)),
        "tweet_length": len(s_tokens)
    })
edf = pd.DataFrame(entity_presence)
# Fixed length bins; observed=False keeps empty (bin, has_entity) combinations with count 0.
edf["len_bin"] = pd.cut(edf["tweet_length"], bins=[0, 5, 10, 15, 20, 40, 80], include_lowest=True)
bivar = edf.groupby(["len_bin", "has_entity"], observed=False).size().reset_index(name="count")

plt.figure(figsize=(12, 5))
sns.barplot(data=bivar, x="len_bin", y="count", hue="has_entity", palette="mako")
plt.title("Entity Presence by Tweet Length Bin")
plt.xticks(rotation=35, ha="right")
plt.show()

# Multivariate: tag transition matrix for top tags
# Count every adjacent (current tag, next tag) pair within each training sequence.
transitions = Counter()
for seq in train_tags:
    for a, b in zip(seq[:-1], seq[1:]):
        transitions[(a, b)] += 1

# Restrict the matrix to the 8 most frequent training tags; rows = from-tag, columns = to-tag.
top_tags = [t for t, _ in df_train["tag"].value_counts().head(8).items()]
mat = pd.DataFrame(0, index=top_tags, columns=top_tags)
for (a, b), c in transitions.items():
    if a in top_tags and b in top_tags:
        mat.loc[a, b] += c

plt.figure(figsize=(9, 7))
sns.heatmap(mat, cmap="YlGnBu", annot=False)
plt.title("Tag Transition Heatmap (Top Tags)")
plt.show()

display(bivar.head(20))


# Sentence-level split for leakage-safe validation
# Whole sentences are assigned to train or validation (80/20), stratified on
# whether the sentence contains at least one non-"O" tag.
idx = np.arange(len(train_sentences))
has_entity = np.array([int(any(t != "O" for t in seq)) for seq in train_tags])
train_idx, val_idx = train_test_split(idx, test_size=0.2, random_state=SEED, stratify=has_entity)

X_train_sent = [train_sentences[i] for i in train_idx]
y_train_sent = [train_tags[i] for i in train_idx]
X_val_sent = [train_sentences[i] for i in val_idx]
y_val_sent = [train_tags[i] for i in val_idx]

# The provided test file is used as-is.
X_test_sent = test_sentences
y_test_sent = test_tags

# Tag inventory is taken from the full training file (before the train/val split), sorted.
all_tags = sorted(df_train["tag"].unique())
print("Total tag classes:", len(all_tags))
print("Train sentences:", len(X_train_sent), "Validation sentences:", len(X_val_sent), "Test sentences:", len(X_test_sent))

Total tag classes: 21
Train sentences: 1915 Validation sentences: 479 Test sentences: 3850


def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list.

    Args:
        list_of_lists: an iterable of iterables.

    Returns:
        A new list holding all inner items in their original order.
    """
    merged = []
    for inner in list_of_lists:
        merged.extend(inner)
    return merged

def token_features(sent, i):
    """Build the hand-crafted feature dict for the token at position ``i``.

    Features cover the token itself (raw and lower-cased), casing/digit flags,
    one- and two-character prefixes and suffixes, the lower-cased neighbouring
    tokens, and simple hashtag / mention / URL indicators. Sentence boundaries
    are represented by the ``<START>`` and ``<END>`` placeholders.

    Args:
        sent: sequence of token strings.
        i: index of the token to describe.

    Returns:
        A dict mapping feature names to string or boolean values.
    """
    word = sent[i]
    lowered = word.lower()

    # Neighbours fall back to boundary placeholders at the sentence edges.
    left = "<START>" if i <= 0 else sent[i - 1]
    right = "<END>" if i >= len(sent) - 1 else sent[i + 1]

    looks_like_url = any(hint in lowered for hint in ("http", "www"))

    return {
        "tok": word,
        "tok_lower": lowered,
        "is_upper": word.isupper(),
        "is_title": word.istitle(),
        "is_digit": word.isdigit(),
        "prefix1": word[:1],
        "prefix2": word[:2],
        "suffix1": word[-1:],
        "suffix2": word[-2:],
        "prev_lower": left.lower(),
        "next_lower": right.lower(),
        "has_hash": "#" in word,
        "has_at": "@" in word,
        "has_url_hint": looks_like_url,
    }

def build_xy(sentences, tags):
    """Turn sentence-level data into flat, token-level features and labels.

    Args:
        sentences: list of token lists.
        tags: list of tag lists, aligned with ``sentences``.

    Returns:
        A pair ``(X, y)``: ``X`` holds one ``token_features`` dict per token
        and ``y`` the tag found at the same position of the matching tag list.
    """
    feature_rows, labels = [], []
    for words, word_tags in zip(sentences, tags):
        for pos, _ in enumerate(words):
            feature_rows.append(token_features(words, pos))
            labels.append(word_tags[pos])
    return feature_rows, labels

# Token-level feature dicts and flat label lists for each split.
X_train_dict, y_train_flat = build_xy(X_train_sent, y_train_sent)
X_val_dict, y_val_flat = build_xy(X_val_sent, y_val_sent)
X_test_dict, y_test_flat = build_xy(X_test_sent, y_test_sent)

# The vectorizer is fitted on the training split only; validation and test are
# transformed with that fitted feature space.
vectorizer = DictVectorizer(sparse=True)
X_train_vec = vectorizer.fit_transform(X_train_dict)
X_val_vec = vectorizer.transform(X_val_dict)
X_test_vec = vectorizer.transform(X_test_dict)

print("Vectorized feature shape (train):", X_train_vec.shape)

Vectorized feature shape (train): (37160, 34419)


def macro_non_o_metrics(y_true, y_pred, labels):
    """Compute macro-averaged precision, recall and F1 over every label except "O".

    Args:
        y_true: gold labels, one per token.
        y_pred: predicted labels, aligned with ``y_true``.
        labels: full label inventory; the "O" label is dropped before scoring.

    Returns:
        A dict with keys ``precision_non_o_macro``, ``recall_non_o_macro`` and
        ``f1_non_o_macro``.
    """
    entity_labels = [label for label in labels if label != "O"]
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true,
        y_pred,
        labels=entity_labels,
        average="macro",
        zero_division=0,
    )
    scores = {}
    scores["precision_non_o_macro"] = precision
    scores["recall_non_o_macro"] = recall
    scores["f1_non_o_macro"] = f_score
    return scores

# Baseline 1: Majority class (naive)
# Predict the most frequent training tag for every validation and test token.
majority_tag = pd.Series(y_train_flat).value_counts().idxmax()
maj_val_pred = [majority_tag] * len(y_val_flat)
maj_test_pred = [majority_tag] * len(y_test_flat)

maj_val_metrics = macro_non_o_metrics(y_val_flat, maj_val_pred, all_tags)
maj_test_metrics = macro_non_o_metrics(y_test_flat, maj_test_pred, all_tags)
print("Majority tag baseline (Val):", maj_val_metrics)
print("Majority tag baseline (Test):", maj_test_metrics)

# Baseline 2 + tuning: SGDClassifier (logistic loss)
# Small grid over the regularisation strength; model selection uses validation
# non-O macro F1 only, so the test split is not touched here.
alphas = [1e-5, 5e-5, 1e-4, 5e-4]
best_alpha = None
best_f1 = -1
best_model = None

for alpha in alphas:
    clf = SGDClassifier(loss="log_loss", alpha=alpha, max_iter=50, tol=1e-3, random_state=SEED, n_jobs=-1, class_weight="balanced")
    clf.fit(X_train_vec, y_train_flat)
    val_pred = clf.predict(X_val_vec)
    score = macro_non_o_metrics(y_val_flat, val_pred, all_tags)["f1_non_o_macro"]
    # Strict ">" keeps the earliest alpha in the grid when scores tie.
    if score > best_f1:
        best_f1 = score
        best_alpha = alpha
        best_model = clf

print(f"Best alpha: {best_alpha} | Val non-O macro F1: {best_f1:.4f}")

Majority tag baseline (Val): {'precision_non_o_macro': 0.0, 'recall_non_o_macro': 0.0, 'f1_non_o_macro': 0.0}
Majority tag baseline (Test): {'precision_non_o_macro': 0.0, 'recall_non_o_macro': 0.0, 'f1_non_o_macro': 0.0}
Best alpha: 1e-05 | Val non-O macro F1: 0.3896


# Hard predictions and class probabilities from the selected SGD model.
sgd_val_pred = best_model.predict(X_val_vec)
sgd_test_pred = best_model.predict(X_test_vec)

sgd_val_proba = best_model.predict_proba(X_val_vec)
sgd_test_proba = best_model.predict_proba(X_test_vec)

# One-hot encode the gold tags so they can be scored against the probability matrices.
# NOTE(review): this assumes the probability columns are in the same order as
# lb.classes_ (i.e. every tag in all_tags occurs in the training split) — confirm.
lb = LabelEncoder().fit(all_tags)
y_val_bin = label_binarize(lb.transform(y_val_flat), classes=np.arange(len(lb.classes_)))
y_test_bin = label_binarize(lb.transform(y_test_flat), classes=np.arange(len(lb.classes_)))

def safe_auc(y_bin, y_proba):
    """Compute macro ROC-AUC and macro average precision on a best-effort basis.

    Each metric is computed independently. If a metric cannot be computed, the
    failure is reported through a ``RuntimeWarning`` naming the cause and NaN
    is returned for that metric, so the caller still gets a usable pair and
    the reason is no longer hidden.

    Args:
        y_bin: binarised (one column per class) ground-truth indicator matrix.
        y_proba: predicted scores/probabilities with the same layout as ``y_bin``.

    Returns:
        A pair ``(roc, pr)`` of floats; either element is NaN when that metric
        failed.
    """
    import warnings  # local import: only needed on the failure path

    try:
        roc = roc_auc_score(y_bin, y_proba, average="macro", multi_class="ovr")
    except Exception as exc:  # deliberate best-effort: report, then fall back to NaN
        warnings.warn(f"ROC-AUC could not be computed: {exc!r}", RuntimeWarning, stacklevel=2)
        roc = np.nan

    try:
        pr = average_precision_score(y_bin, y_proba, average="macro")
    except Exception as exc:  # deliberate best-effort: report, then fall back to NaN
        warnings.warn(f"PR-AUC could not be computed: {exc!r}", RuntimeWarning, stacklevel=2)
        pr = np.nan

    return roc, pr

# Ranking metrics computed from the probability outputs (NaN when unavailable).
val_roc, val_pr = safe_auc(y_val_bin, sgd_val_proba)
test_roc, test_pr = safe_auc(y_test_bin, sgd_test_proba)

# Non-O macro precision/recall/F1 computed from the hard predictions.
sgd_val_metrics = macro_non_o_metrics(y_val_flat, sgd_val_pred, all_tags)
sgd_test_metrics = macro_non_o_metrics(y_test_flat, sgd_test_pred, all_tags)

print("SGD tuned model - validation:", sgd_val_metrics, "| ROC-AUC:", round(val_roc, 4), "| PR-AUC:", round(val_pr, 4))
print("SGD tuned model - test:", sgd_test_metrics, "| ROC-AUC:", round(test_roc, 4), "| PR-AUC:", round(test_pr, 4))

# NOTE(review): the header says "top labels", but labels=all_tags reports every tag.
print("\nToken-level classification report (test, top labels):")
print(classification_report(y_test_flat, sgd_test_pred, labels=all_tags, zero_division=0, digits=3))

SGD tuned model - validation: {'precision_non_o_macro': 0.5729025010290733, 'recall_non_o_macro': 0.3579436388360801, 'f1_non_o_macro': 0.389556528833203} | ROC-AUC: 0.9158 | PR-AUC: 0.3962
SGD tuned model - test: {'precision_non_o_macro': 0.21970364198700687, 'recall_non_o_macro': 0.10277089217834683, 'f1_non_o_macro': 0.11101606717485009} | ROC-AUC: 0.8122 | PR-AUC: 0.1366

Token-level classification report (test, top labels):
               precision    recall  f1-score   support

    B-company      0.467     0.161     0.240       621
   B-facility      0.435     0.198     0.272       253
    B-geo-loc      0.497     0.514     0.505       882
      B-movie      0.100     0.029     0.045        34
B-musicartist      0.000     0.000     0.000       191
      B-other      0.147     0.144     0.145       584
     B-person      0.199     0.417     0.269       482
    B-product      0.148     0.033     0.053       246
 B-sportsteam      0.200     0.020     0.037       147
     B-tvshow      0.000     0.000     0.000        33
    I-company      0.647     0.042     0.078       265
   I-facility      0.424     0.068     0.118       366
    I-geo-loc      0.381     0.037     0.067       219
      I-movie      0.000     0.000     0.000        48
I-musicartist      0.000     0.000     0.000       140
      I-other      0.197     0.183     0.190       556
     I-person      0.136     0.183     0.156       300
    I-product      0.333     0.006     0.012       500
 I-sportsteam      0.083     0.021     0.033        48
     I-tvshow      0.000     0.000     0.000        40
            O      0.940     0.974     0.957     55953

     accuracy                          0.898     61908
    macro avg      0.254     0.144     0.151     61908
 weighted avg      0.879     0.898     0.883     61908


# Interpretation for linear model: top weighted features per important classes
feat_names = np.array(vectorizer.get_feature_names_out())
classes = best_model.classes_
coef = best_model.coef_

# Inspect the first six non-"O" classes in classes_ order, not a frequency ranking.
target_interest = [c for c in classes if c != "O"][:6]
interpret_rows = []
for cls in target_interest:
    # Row of the coefficient matrix that belongs to this class.
    # NOTE(review): this rebinds the global `idx` created by the split cell.
    idx = np.where(classes == cls)[0][0]
    # Indices of the 8 largest weights, ordered from largest to smallest.
    top_idx = np.argsort(coef[idx])[-8:][::-1]
    for j in top_idx:
        interpret_rows.append({"class": cls, "feature": feat_names[j], "weight": float(coef[idx, j])})

interp_df = pd.DataFrame(interpret_rows)
display(interp_df.head(30))


# Vocabulary from the training split only, on lower-cased tokens.
# Id 0 is reserved for padding and id 1 for unknown words; only words seen at
# least twice receive their own id.
word_counts = Counter(tok.lower() for sent in X_train_sent for tok in sent)
vocab = {"<PAD>": 0, "<UNK>": 1}
for w, c in word_counts.items():
    if c >= 2:
        vocab[w] = len(vocab)

# Tag <-> integer id maps, with ids assigned in sorted tag order.
tag2id = {t: i for i, t in enumerate(sorted(all_tags))}
id2tag = {i: t for t, i in tag2id.items()}

def encode_sentence(sent):
    """Map each token to its vocabulary id, using the ``<UNK>`` id for unseen words.

    Tokens are lower-cased before the lookup.

    Args:
        sent: sequence of token strings.

    Returns:
        A list of integer ids, one per token.
    """
    ids = []
    for word in sent:
        ids.append(vocab.get(word.lower(), vocab["<UNK>"]))
    return ids

def encode_tags(tseq):
    """Map each tag string to its integer id via ``tag2id``.

    Args:
        tseq: sequence of tag strings.

    Returns:
        A list of integer tag ids, one per tag.

    Raises:
        KeyError: if a tag is not present in ``tag2id``.
    """
    encoded = []
    for label in tseq:
        encoded.append(tag2id[label])
    return encoded

class NerDataset(Dataset):
    """Dataset of (token-id tensor, tag-id tensor) pairs, one pair per sentence.

    All sentences are encoded to ``torch.long`` tensors at construction time,
    so item access is a plain list lookup.
    """

    def __init__(self, sents, tags):
        """Encode every sentence and tag sequence up front.

        Args:
            sents: list of token lists.
            tags: list of tag lists, aligned with ``sents``.
        """
        # Precompute tensors once to reduce per-batch CPU overhead.
        self.x = []
        for words in sents:
            self.x.append(torch.tensor(encode_sentence(words), dtype=torch.long))

        self.y = []
        for labels in tags:
            self.y.append(torch.tensor(encode_tags(labels), dtype=torch.long))

    def __len__(self):
        """Return the number of sentences."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(token_ids, tag_ids)`` tensors stored at position ``idx``."""
        token_ids = self.x[idx]
        tag_ids = self.y[idx]
        return token_ids, tag_ids

def collate_fn(batch):
    """Pad a batch of variable-length (token_ids, tag_ids) pairs to a common width.

    Token ids are right-padded with 0 and tag ids with -100, up to the length
    of the longest token sequence in the batch.

    Args:
        batch: sequence of ``(token_ids, tag_ids)`` 1-D tensor pairs.

    Returns:
        A triple ``(x_pad, y_pad, lens)``: two ``(batch, max_len)`` long
        tensors and a 1-D long tensor with the original token-sequence lengths.
    """
    token_seqs, tag_seqs = zip(*batch)
    seq_lens = [len(seq) for seq in token_seqs]
    n_rows = len(token_seqs)
    width = max(seq_lens)

    padded_tokens = torch.zeros((n_rows, width), dtype=torch.long)
    padded_tags = torch.full((n_rows, width), fill_value=-100, dtype=torch.long)

    # Copy each sequence into the left part of its row; the rest keeps the pad value.
    for row in range(n_rows):
        tokens = token_seqs[row]
        labels = tag_seqs[row]
        padded_tokens[row, :len(tokens)] = tokens
        padded_tags[row, :len(labels)] = labels

    lengths = torch.tensor(seq_lens, dtype=torch.long)
    return padded_tokens, padded_tags, lengths

# Pre-encoded datasets for the three splits.
train_ds = NerDataset(X_train_sent, y_train_sent)
val_ds = NerDataset(X_val_sent, y_val_sent)
test_ds = NerDataset(X_test_sent, y_test_sent)

# Batch size depends on the device: larger on GPUs with at least 8 GB, smallest on CPU.
if device.type == "cuda":
    BATCH_SIZE = 64 if GPU_MEMORY_GB >= 8 else 32
else:
    BATCH_SIZE = 24

# Half of the logical CPUs, clamped to the range [2, 8].
DATALOADER_WORKERS = min(8, max(2, LOGICAL_CPUS // 2))
PIN_MEMORY = device.type == "cuda"

dl_kwargs = {
    "num_workers": DATALOADER_WORKERS,
    "pin_memory": PIN_MEMORY,
}
# These two options are only passed when worker processes are used. With the
# clamp above the worker count is always >= 2, so this branch is always taken.
if DATALOADER_WORKERS > 0:
    dl_kwargs["persistent_workers"] = True
    dl_kwargs["prefetch_factor"] = 2

# Only the training loader shuffles; validation and test keep dataset order.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, **dl_kwargs)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, **dl_kwargs)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, **dl_kwargs)

print({
    "batch_size": BATCH_SIZE,
    "dataloader_workers": DATALOADER_WORKERS,
    "pin_memory": PIN_MEMORY
})

{'batch_size': 24, 'dataloader_workers': 2, 'pin_memory': False}


class BiLSTMTagger(nn.Module):
    """Token tagger: embedding -> one bidirectional LSTM layer -> dropout -> linear scorer.

    Args:
        vocab_size: number of rows in the embedding table; id 0 is the padding id.
        num_tags: number of output tag classes.
        emb_dim: embedding width.
        hidden_dim: requested combined width of the two LSTM directions. Each
            direction receives ``hidden_dim // 2`` units, so an odd value
            yields a combined width of ``hidden_dim - 1``.
        dropout: dropout probability applied to the LSTM outputs.
    """

    def __init__(self, vocab_size, num_tags, emb_dim=128, hidden_dim=192, dropout=0.3):
        super().__init__()
        per_direction = hidden_dim // 2
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, per_direction, num_layers=1, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(dropout)
        # Size the projection from the LSTM's actual output width (two
        # directions). For even hidden_dim this equals hidden_dim; for odd
        # values it prevents a shape mismatch in forward().
        self.fc = nn.Linear(2 * per_direction, num_tags)

    def forward(self, x):
        """Score every token position.

        Args:
            x: long tensor of token ids, laid out as (batch, seq_len) because
                the LSTM is built with ``batch_first=True``.

        Returns:
            Tensor of per-token tag logits with ``num_tags`` as the last dimension.
        """
        z = self.emb(x)
        z, _ = self.lstm(z)
        z = self.drop(z)
        return self.fc(z)

# Mixed precision (autocast + gradient scaling) is switched on only when running on CUDA.
AMP_ENABLED = device.type == "cuda"

def eval_loader(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` and score it on the non-"O" tags.

    The model is put in eval mode and run without gradient tracking. Positions
    whose gold label is -100 (padding) are excluded from the collected labels.

    Args:
        model: module mapping a batch of token ids to per-token logits.
        loader: iterable yielding ``(token_ids, tag_ids, lengths)`` batches.
        criterion: loss taking flattened logits and flattened gold tag ids.

    Returns:
        A tuple ``(mean_batch_loss, precision, recall, f1, y_true, y_pred)``
        where the three scores are macro averages over all tag ids except the
        one for "O", and ``y_true`` / ``y_pred`` are flat lists of tag ids.
    """
    model.eval()
    loss_sum = 0.0
    gold, guessed = [], []

    with torch.no_grad():
        for xb, yb, _lens in loader:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)

            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=AMP_ENABLED):
                logits = model(xb)
                flat_logits = logits.view(-1, logits.size(-1))
                loss = criterion(flat_logits, yb.view(-1))
            loss_sum += loss.item()

            # Boolean masking flattens row by row, so gold and predicted ids stay aligned.
            keep = yb != -100
            gold.extend(yb[keep].tolist())
            guessed.extend(logits.argmax(-1)[keep].tolist())

    entity_ids = [tag_id for tag, tag_id in tag2id.items() if tag != "O"]
    precision, recall, f_score, _support = precision_recall_fscore_support(
        gold,
        guessed,
        labels=entity_ids,
        average="macro",
        zero_division=0,
    )
    # max(1, ...) avoids dividing by zero for an empty loader.
    mean_loss = loss_sum / max(1, len(loader))
    return mean_loss, precision, recall, f_score, gold, guessed

# Model, loss, optimizer and AMP gradient scaler.
model = BiLSTMTagger(vocab_size=len(vocab), num_tags=len(tag2id)).to(device)
# Padded label positions (-100) are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=2e-3, weight_decay=1e-2)
scaler = torch.amp.GradScaler("cuda", enabled=AMP_ENABLED)

# Early-stopping state: stop after `patience` consecutive epochs without a
# validation-F1 improvement, and remember the best weights seen so far.
max_epochs = 12
patience = 3
best_val_f1 = -1
best_state = None
wait = 0
history = []

for epoch in range(1, max_epochs + 1):
    model.train()
    train_loss = 0.0
    for xb, yb, lens in train_dl:
        xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=AMP_ENABLED):
            logits = model(xb)
            # Flatten to (tokens, num_tags) vs (tokens,) for the token-level loss.
            loss = criterion(logits.view(-1, logits.size(-1)), yb.view(-1))

        scaler.scale(loss).backward()
        # Unscale first so the clipping threshold applies to the real gradient norm.
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        train_loss += loss.item()

    # Validation after every epoch; losses are averaged per batch.
    val_loss, val_p, val_r, val_f1, _, _ = eval_loader(model, val_dl, criterion)
    row = {
        "epoch": epoch,
        "train_loss": train_loss / max(1, len(train_dl)),
        "val_loss": val_loss,
        "val_p": val_p,
        "val_r": val_r,
        "val_f1": val_f1
    }
    history.append(row)
    print(row)

    # Only an improvement of more than 1e-4 counts; it resets the patience counter.
    if val_f1 > best_val_f1 + 1e-4:
        best_val_f1 = val_f1
        # Snapshot the weights on CPU so later updates cannot modify the copy.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping triggered at epoch {epoch}.")
            break

# Restore the best validation checkpoint before any further evaluation.
if best_state is not None:
    model.load_state_dict(best_state)

history_df = pd.DataFrame(history)
display(history_df)

{'epoch': 1, 'train_loss': 0.6315472554415464, 'val_loss': 0.3416497975587845, 'val_p': 0.0, 'val_r': 0.0, 'val_f1': 0.0}
{'epoch': 2, 'train_loss': 0.32090373858809473, 'val_loss': 0.31870328933000563, 'val_p': 0.0, 'val_r': 0.0, 'val_f1': 0.0}
{'epoch': 3, 'train_loss': 0.283434266410768, 'val_loss': 0.29011300429701803, 'val_p': 0.14285714285714285, 'val_r': 0.01967012239615602, 'val_f1': 0.0340158371040724}
{'epoch': 4, 'train_loss': 0.2406498217023909, 'val_loss': 0.269020726531744, 'val_p': 0.18015873015873016, 'val_r': 0.04067522842780501, 'val_f1': 0.06441873406182726}
{'epoch': 5, 'train_loss': 0.20831267209723592, 'val_loss': 0.26783784218132495, 'val_p': 0.21749999999999997, 'val_r': 0.05436909289010009, 'val_f1': 0.08388319741260918}
{'epoch': 6, 'train_loss': 0.1771689816378057, 'val_loss': 0.25499492399394513, 'val_p': 0.23805745393980687, 'val_r': 0.06865077611915174, 'val_f1': 0.10290274495580272}
{'epoch': 7, 'train_loss': 0.15021010828204454, 'val_loss': 0.2514362670481205, 'val_p': 0.2916548369610693, 'val_r': 0.08875440672064216, 'val_f1': 0.1249114935504398}
{'epoch': 8, 'train_loss': 0.1277371391421184, 'val_loss': 0.2624364599585533, 'val_p': 0.42650000000000005, 'val_r': 0.10733585448607327, 'val_f1': 0.16154556197658693}
{'epoch': 9, 'train_loss': 0.1107956247869879, 'val_loss': 0.28152144625782966, 'val_p': 0.4843411796536796, 'val_r': 0.12182970341594412, 'val_f1': 0.18307033779301857}
{'epoch': 10, 'train_loss': 0.09113021909724921, 'val_loss': 0.27317013368010523, 'val_p': 0.4123737373737374, 'val_r': 0.18075102198434895, 'val_f1': 0.22355717359245167}
{'epoch': 11, 'train_loss': 0.07828086472582071, 'val_loss': 0.2760264754295349, 'val_p': 0.3667083511726369, 'val_r': 0.18826009847104325, 'val_f1': 0.23296055454567086}
{'epoch': 12, 'train_loss': 0.0670258205384016, 'val_loss': 0.28611495569348333, 'val_p': 0.5164206125011209, 'val_r': 0.26572183657830134, 'val_f1': 0.3100919086946121}


# Final BiLSTM scores on validation and test; the loss value is discarded here.
_, val_p, val_r, val_f1, yv_true, yv_pred = eval_loader(model, val_dl, criterion)
_, test_p, test_r, test_f1, yt_true, yt_pred = eval_loader(model, test_dl, criterion)

# Same key names as the metric dicts of the other models, so results can be tabulated together.
bilstm_val_metrics = {"precision_non_o_macro": val_p, "recall_non_o_macro": val_r, "f1_non_o_macro": val_f1}
bilstm_test_metrics = {"precision_non_o_macro": test_p, "recall_non_o_macro": test_r, "f1_non_o_macro": test_f1}

print("BiLSTM validation metrics:", bilstm_val_metrics)
print("BiLSTM test metrics:", bilstm_test_metrics)

# Save the weights together with the vocabulary, tag maps and seed in one checkpoint file.
model_path = os.path.join(ARTIFACT_DIR, "bilstm_ner_wnut16.pt")
torch.save({
    "model_state_dict": model.state_dict(),
    "vocab": vocab,
    "tag2id": tag2id,
    "id2tag": id2tag,
    "seed": SEED
}, model_path)
print("Saved model artifact:", model_path)

BiLSTM validation metrics: {'precision_non_o_macro': 0.5164206125011209, 'recall_non_o_macro': 0.26572183657830134, 'f1_non_o_macro': 0.3100919086946121}
BiLSTM test metrics: {'precision_non_o_macro': 0.11603245523314501, 'recall_non_o_macro': 0.07092986556572056, 'f1_non_o_macro': 0.07913360431459673}
Saved model artifact: ../models/bilstm_ner_wnut16.pt


# Compare model outcomes and summarize trade-offs
# One row per model with its test-set metrics; ROC/PR-AUC are only available
# for the SGD model and are NaN for the others.
results = pd.DataFrame([
    {"model": "Majority Tag Baseline", **maj_test_metrics, "roc_auc_macro_ovr": np.nan, "pr_auc_macro": np.nan},
    {"model": "Tuned SGD Token Classifier", **sgd_test_metrics, "roc_auc_macro_ovr": test_roc, "pr_auc_macro": test_pr},
    {"model": "BiLSTM Sequence Model (PyTorch)", **bilstm_test_metrics, "roc_auc_macro_ovr": np.nan, "pr_auc_macro": np.nan},
])
display(results.sort_values("f1_non_o_macro", ascending=False))

plot_df = results.set_index("model")[["precision_non_o_macro", "recall_non_o_macro", "f1_non_o_macro"]]
# Create a single Axes and hand it to DataFrame.plot. Calling plt.figure()
# first and then DataFrame.plot without `ax` opens a second figure and leaves
# the first one empty in the output.
cmp_fig, cmp_ax = plt.subplots(figsize=(12, 5))
plot_df.plot(kind="bar", ax=cmp_ax, colormap="tab20c")
cmp_ax.set_title("Model Comparison on Test (Non-O Macro Metrics)")
plt.xticks(rotation=20, ha="right")
plt.tight_layout()
plt.show()

<Figure size 1000x500 with 0 Axes>


# Stability check across seeds for tuned SGD as a fast proxy
# Refit the SGD model with the selected alpha under three seeds and compare
# validation scores.
seed_runs = []
for rs in [11, 21, 42]:
    clf = SGDClassifier(loss="log_loss", alpha=best_alpha, max_iter=50, tol=1e-3, random_state=rs, n_jobs=-1, class_weight="balanced")
    clf.fit(X_train_vec, y_train_flat)
    pred = clf.predict(X_val_vec)
    m = macro_non_o_metrics(y_val_flat, pred, all_tags)
    seed_runs.append({"seed": rs, **m})

stability_df = pd.DataFrame(seed_runs)
display(stability_df)

# Flag instability when the standard deviation of validation F1 across seeds exceeds 0.02.
unstable = stability_df["f1_non_o_macro"].std() > 0.02
print("Unstable results flag:", unstable)
if unstable:
    # Candidate alternatives to consider when results vary too much between seeds.
    alt_df = pd.DataFrame([
        {"alternative": "BiLSTM-CRF", "expected_gain": "Better boundary consistency", "cost": "Medium training complexity"},
        {"alternative": "DistilBERT token classifier", "expected_gain": "Higher contextual generalization", "cost": "Higher GPU and memory need"}
    ])
    display(alt_df)
else:
    print("Current pipeline is reasonably stable across tested seeds.")

Unstable results flag: False
Current pipeline is reasonably stable across tested seeds.

	len_bin	has_entity	count
0	(-0.001, 5.0]	0	60
1	(-0.001, 5.0]	1	7
2	(5.0, 10.0]	0	213
3	(5.0, 10.0]	1	66
4	(10.0, 15.0]	0	290
5	(10.0, 15.0]	1	138
6	(15.0, 20.0]	0	314
7	(15.0, 20.0]	1	189
8	(20.0, 40.0]	0	599
9	(20.0, 40.0]	1	518
10	(40.0, 80.0]	0	0
11	(40.0, 80.0]	1	0

	class	feature	weight
0	B-company	tok_lower=twitter	10.769429
1	B-company	prev_lower=@mckenziecomer	7.130837
2	B-company	next_lower=tube	7.130837
3	B-company	next_lower=navy	6.736761
4	B-company	tok_lower=youtube	6.633267
5	B-company	tok_lower=facebook	6.452400
6	B-company	prev_lower=from	6.232648
7	B-company	is_title	6.086474
8	B-facility	prev_lower=at	18.182325
9	B-facility	next_lower=fitzwilliam	17.428437
10	B-facility	next_lower=tall	15.492138
11	B-facility	prev_lower=to	12.849362
12	B-facility	next_lower=blast	12.832321
13	B-facility	next_lower=apollo	12.703043
14	B-facility	next_lower=funny	10.668499
15	B-facility	prev_lower=@	10.553440
16	B-geo-loc	next_lower=states	9.557807
17	B-geo-loc	prev_lower=in	9.202844
18	B-geo-loc	is_title	9.006722
19	B-geo-loc	prev_lower=to	8.520982
20	B-geo-loc	is_upper	7.822328
21	B-geo-loc	next_lower=york	7.097804
22	B-geo-loc	prev_lower=on	6.398578
23	B-geo-loc	tok_lower=uk	5.846091
24	B-movie	next_lower=town	26.324285
25	B-movie	tok=Winter	26.145873
26	B-movie	tok=Kick-Ass	25.146745
27	B-movie	tok_lower=kick-ass	25.146745
28	B-movie	tok_lower=winter	25.061682
29	B-movie	prev_lower=watching	23.628846

	model	precision_non_o_macro	recall_non_o_macro	f1_non_o_macro	roc_auc_macro_ovr	pr_auc_macro
1	Tuned SGD Token Classifier	0.219704	0.102771	0.111016	0.812225	0.136598
2	BiLSTM Sequence Model (PyTorch)	0.116032	0.070930	0.079134	NaN	NaN
0	Majority Tag Baseline	0.000000	0.000000	0.000000	NaN	NaN

	seed	precision_non_o_macro	recall_non_o_macro	f1_non_o_macro
0	11	0.631939	0.350484	0.395544
1	21	0.543066	0.359692	0.394947
2	42	0.572903	0.357944	0.389557

Tweet NER Case Study: Business-Ready Pipeline

Business Objective

Stakeholder Context

Confidentiality

Problem Definition and Success Criteria

Data Format: CoNLL BIO and Alternatives

What is the format used here?

Why tokenization is needed

Other annotation formats

EDA: Univariate, Bivariate, Multivariate, Outlier and Skewness

Preprocessing and Feature Engineering Rationale

PyTorch Sequence Model (BiLSTM) with Early Stopping

First-Run Output Snapshot (Business View)

Performance Insights and Business Interpretation

Measured performance from first run

Operational interpretation

Strategic model direction

Strategic Recommendations and Future Directions

Platform integration recommendations

Improvement roadmap

Future NLP developments and implications

Executive Summary (LinkedIn-Ready)

Risks, Assumptions, and Monitoring Plan

Risks

Assumptions

Monitoring Plan

	split	token_idx	token	tag
0	train	0	@SammieLynnsMom	O
1	train	1	@tg10781	O
2	train	2	they	O
3	train	3	will	O
4	train	4	be	O

	column	dtype
0	split	object
1	sentence_id	int64
2	token_idx	int64
3	token	object
4	tag	object

	tag	count	pct
0	O	44007	94.701844
1	B-person	449	0.966236
2	I-other	320	0.688631
3	B-geo-loc	276	0.593944
4	B-other	225	0.484194
5	I-person	215	0.462674
6	B-company	171	0.367987
7	I-facility	105	0.225957
8	B-facility	104	0.223805
9	B-product	97	0.208741
10	I-product	80	0.172158
11	I-musicartist	61	0.131270
12	B-musicartist	55	0.118358
13	B-sportsteam	51	0.109751
14	I-geo-loc	49	0.105447
15	I-movie	46	0.098991
16	I-company	36	0.077471
17	B-movie	34	0.073167
18	B-tvshow	34	0.073167
19	I-tvshow	31	0.066711

	epoch	train_loss	val_loss	val_p	val_r	val_f1
0	1	0.631547	0.341650	0.000000	0.000000	0.000000
1	2	0.320904	0.318703	0.000000	0.000000	0.000000
2	3	0.283434	0.290113	0.142857	0.019670	0.034016
3	4	0.240650	0.269021	0.180159	0.040675	0.064419
4	5	0.208313	0.267838	0.217500	0.054369	0.083883
5	6	0.177169	0.254995	0.238057	0.068651	0.102903
6	7	0.150210	0.251436	0.291655	0.088754	0.124911
7	8	0.127737	0.262436	0.426500	0.107336	0.161546
8	9	0.110796	0.281521	0.484341	0.121830	0.183070
9	10	0.091130	0.273170	0.412374	0.180751	0.223557
10	11	0.078281	0.276026	0.366708	0.188260	0.232961
11	12	0.067026	0.286115	0.516421	0.265722	0.310092