# Setup — run these in a terminal (they are shell commands, not Python):
#   conda activate your_environment
#   conda install -c conda-forge scikit-learn pandas matplotlib gensim
#   pip install vaderSentiment

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import re

# Sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Supervised ML
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report,
)

import warnings
warnings.filterwarnings("ignore")

print("Libraries loaded.")

# Toy training set — real spam datasets are on UCI / Kaggle; this is for pedagogy.
# The first five messages are ham and the last five are spam; the "label" list
# below relies on that ordering.
train = pd.DataFrame({
    "text": [
        "Hi professor, could you confirm the deadline for the proposal?",
        "Dear student, the next lecture is at 9am in room B.",
        "Let me know if you have any questions about the exam.",
        "Office hours moved to Thursday this week.",
        "Your paper draft looks good — small comments attached.",
        "URGENT! You won $1,000,000! Click here to claim NOW!",
        "Get rich quick with this amazing new crypto opportunity.",
        "Hot singles in your area are waiting! Click to meet them.",
        "Make $5000 per week working from home, no experience needed.",
        "Congratulations!! You have been selected for a free iPhone!!",
    ],
    "label": ["ham"]*5 + ["spam"]*5,
})

# Two-step pipeline: raw word counts (CountVectorizer) fed into multinomial Naive Bayes.
spam_model = make_pipeline(CountVectorizer(), MultinomialNB())
spam_model.fit(train["text"], train["label"])
# Report the class balance of the training data alongside its size.
print(f"Model trained on {len(train)} messages ({(train['label']=='ham').sum()} ham, {(train['label']=='spam').sum()} spam).")

# Pre-built test set — chosen to expose specific failure modes of a small-corpus
# Naive Bayes model. Discuss each prediction with the class.
test_messages = [
    "URGENT! Free iPhone! Click here NOW to claim your prize!!",
    "Free cruise to the Bahamas — respond today!",
    "Could you confirm the deadline for my paper draft?",
    "Are you free for coffee on Thursday?",
    "URGENT: please send me the slides before the seminar tomorrow.",
]

predictions = spam_model.predict(test_messages)
probs = spam_model.predict_proba(test_messages)
classes = spam_model.classes_

# predict_proba columns follow classes_; find the "spam" column once instead of
# repeating the lookup for every message.
spam_col = list(classes).index("spam")

print(f"{'Pred':<6} {'P(spam)':<10} Message")
print("-" * 90)
for msg, pred, p in zip(test_messages, predictions, probs):
    spam_p = p[spam_col]
    # "[ham]" is one character shorter than "[spam]"; pad the tag to the header's
    # 6-character column so every row lines up under "P(spam)" and "Message".
    tag = f"[{pred}]"
    print(f"{tag:<6} {spam_p:>6.2f}     {msg}")

# Load the corpus we built in L5/L6 (and used throughout L7).
# If the CSV is not on disk, fall back to a five-document synthetic corpus so the
# rest of the lecture still runs.
try:
    corpus = pd.read_csv("corpus_clean.csv")
except FileNotFoundError:
    fallback_dates = ["2024-01-25", "2023-10-26", "2023-09-14",
                      "2022-12-15", "2022-09-08"]
    fallback_bodies = [
        "The Governing Council today decided to keep the three key ECB interest rates unchanged. "
        "Inflation is projected to decline gradually but will remain above the 2% target.",
        "The Governing Council kept rates unchanged. Underlying inflation eased and price pressures remain strong.",
        "The Governing Council raised the three key ECB interest rates by 25 basis points to ensure timely return of inflation to target.",
        "The Governing Council raised rates by 50 basis points. Interest rates will rise significantly to reach restrictive levels.",
        "The Governing Council raised the deposit facility rate by 75 basis points to ensure inflation returns to the 2% medium-term target.",
    ]
    corpus = pd.DataFrame({
        "date_parsed": pd.to_datetime(fallback_dates),
        "title": ["Monetary policy decisions"] * len(fallback_dates),
        "body_clean": fallback_bodies,
    })
    status = f"Using synthetic corpus ({len(corpus)} documents)."
else:
    # Unparseable dates become NaT rather than raising.
    corpus["date_parsed"] = pd.to_datetime(corpus["date_parsed"], errors="coerce")
    status = f"Corpus loaded: {len(corpus)} documents"

# Both branches need the calendar year; derive it once, before reporting.
corpus["year"] = corpus["date_parsed"].dt.year
print(status)
corpus.head(3)

analyzer = SentimentIntensityAnalyzer()

# Warm-up: score a handful of hand-written sentences to build intuition before
# running VADER on the corpus.
test_sentences = [
    "The economy is growing strongly and unemployment is low.",
    "Inflation is unacceptably high and risks remain to the upside.",
    "Growth is not as weak as feared, but uncertainty remains.",
    "The Governing Council decided to keep interest rates unchanged.",
]

# Order of the score columns in the printed table.
score_keys = ("compound", "pos", "neu", "neg")

print(f"{'compound':>10} {'pos':>6} {'neu':>6} {'neg':>6}   sentence")
print("-" * 90)
for sentence in test_sentences:
    polarity = analyzer.polarity_scores(sentence)
    compound, pos, neu, neg = (polarity[key] for key in score_keys)
    print(f"{compound:>10.3f} {pos:>6.3f} {neu:>6.3f} {neg:>6.3f}   {sentence}")

# Apply VADER to the entire corpus
def vader_scores(text):
    """Score one document with VADER and return its four scores as a Series.

    ``text`` is coerced with ``str()`` before scoring. The returned Series is
    indexed by ``vader_compound``, ``vader_pos``, ``vader_neg`` and
    ``vader_neu`` (in that order), so applying this function over a Series of
    documents yields a DataFrame with those four columns.
    """
    polarity = analyzer.polarity_scores(str(text))
    # (output column, key in VADER's result dict)
    column_for = (
        ("vader_compound", "compound"),
        ("vader_pos", "pos"),
        ("vader_neg", "neg"),
        ("vader_neu", "neu"),
    )
    return pd.Series({column: polarity[key] for column, key in column_for})

# Score every document, then write the VADER columns onto the corpus.
# Assigning column by column (instead of pd.concat(..., axis=1)) overwrites the
# columns when this cell is re-run; concat would append a second set of
# identically named "vader_*" columns and corpus["vader_compound"] would then
# return a DataFrame instead of a Series.
vader_df = corpus["body_clean"].apply(vader_scores)
for col in vader_df.columns:
    corpus[col] = vader_df[col]

print(corpus[["date_parsed","vader_compound","vader_pos","vader_neg"]].to_string(index=False))

# Plot VADER compound sentiment over time
dates = corpus["date_parsed"]
compound_series = corpus["vader_compound"]

fig, ax = plt.subplots(figsize=(11, 4))
ax.plot(dates, compound_series,
        marker="o", color="steelblue", linewidth=2, markersize=6)

# Shade the area between the curve and zero: blue where the score is
# non-negative, red where it is negative.
shading = (
    (compound_series >= 0, "steelblue"),
    (compound_series < 0, "tomato"),
)
for mask, shade in shading:
    ax.fill_between(dates, compound_series, 0, where=mask, color=shade, alpha=0.2)

ax.axhline(0, color="black", linewidth=0.6)
ax.set_title("ECB sentiment over time (VADER compound)", loc="left", fontsize=12)
ax.set_ylabel("VADER compound score (−1 to +1)")
ax.set_xlabel("")
ax.grid(alpha=0.3)
fig.autofmt_xdate()
fig.tight_layout()
plt.show()

# Lightweight LM Negative word list (selected subset for demonstration).
# For research, always use the full dictionary from the official source:
#   https://sraf.nd.edu/loughranmcdonald-master-dictionary/
LM_NEGATIVE = {
    "abandoned", "adverse", "affect", "against", "allegations",
    "bankruptcy", "breach", "burden",
    "cease", "challenges", "claims", "concerns", "conflicts", "constraints", "costs",
    "damages", "decline", "default", "deficiency", "delay", "difficulty",
    "dispute", "doubt", "downturn",
    "exposure",
    "fail", "failure", "falling", "fraud",
    "harmful",
    "impair", "impose", "instability",
    "lack", "lawsuit", "litigation", "loss", "losses",
    "negative",
    "objection", "obstacle",
    "plaintiff", "problem",
    "recession", "reject", "restraint", "restructuring", "risk", "risks",
    "slowdown", "stagnation",
    "termination", "threat",
    "unfavorable", "unstable",
    "volatility", "vulnerability",
    "weakness", "worsen", "worsening",
}


def lm_negative_share(text):
    """Return the fraction of word tokens in ``text`` that are LM-negative.

    ``text`` is coerced with ``str()`` and lower-cased; tokens are maximal runs
    of the letters a-z. Returns 0.0 when the text contains no such token.
    """
    tokens = re.findall(r"\b[a-z]+\b", str(text).lower())
    total = len(tokens)
    if total == 0:
        return 0.0
    hits = [tok for tok in tokens if tok in LM_NEGATIVE]
    return len(hits) / total

# Add the LM-negative share to the corpus and print it next to VADER's compound score.
lm_shares = corpus["body_clean"].apply(lm_negative_share)
corpus["lm_neg_score"] = lm_shares
comparison = corpus[["date_parsed", "lm_neg_score", "vader_compound"]]
print(comparison.round(4).to_string(index=False))

# Compare VADER vs LM — do they agree?
fig, axes = plt.subplots(1, 2, figsize=(13, 4))

# One entry per panel: (axis, corpus column, line colour, title, y-axis label).
panels = [
    (axes[0], "vader_compound", "steelblue",
     "VADER compound score", "Score (−1 to +1)"),
    (axes[1], "lm_neg_score", "tomato",
     "Loughran-McDonald negative share", "Share of LM-negative words"),
]
for panel_ax, column, colour, title, ylabel in panels:
    panel_ax.plot(corpus["date_parsed"], corpus[column],
                  marker="o", color=colour, linewidth=2)
    panel_ax.set_title(title, fontsize=11, loc="left")
    panel_ax.set_xlabel("Date")
    panel_ax.set_ylabel(ylabel)
    panel_ax.grid(alpha=0.3)

# Only the VADER panel has a meaningful zero line.
axes[0].axhline(0, color="black", linewidth=0.6)

for ax in axes:
    ax.tick_params(axis="x", rotation=30)
fig.tight_layout()
plt.show()

# Correlation between sentiment and year — does tone shift across the ECB tightening / easing cycle?
corr_columns = ["year", "vader_compound", "lm_neg_score"]
corr_df = corpus[corr_columns].dropna()
print("Correlation matrix:")
print(corr_df.corr().round(3))

# Pairwise correlation of the two sentiment measures, taken from the full corpus.
vader_lm_corr = corpus[["vader_compound", "lm_neg_score"]].corr().iloc[0, 1]
print(f"\nLM-negative vs VADER compound correlation: {vader_lm_corr:.3f}")

# Simple scatter coloured by LM negative share
fig, ax = plt.subplots(figsize=(7, 4))
sc = ax.scatter(
    corpus["year"],
    corpus["vader_compound"],
    c=corpus["lm_neg_score"],
    cmap="RdYlGn_r",
    s=80,
    zorder=3,
)
plt.colorbar(sc, ax=ax, label="LM negative share")
ax.axhline(0, color="black", linewidth=0.6)
ax.set_xlabel("Year")
ax.set_ylabel("VADER compound")
ax.set_title("Tone over the policy cycle (point colour = LM negative share)", loc="left", fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

# Hand-labelled training set: 30 short policy-style statements, balanced.
# In a real project this would be 500-5000 documents, often crowdsourced.
# The texts are grouped by class (15 positive, then 15 negative); the "label"
# list at the bottom relies on that ordering.
training_data = pd.DataFrame({
    "text": [
        # POSITIVE / dovish-supportive (15)
        "Inflation is converging to target and growth has resumed.",
        "Economic activity is expanding broadly across sectors.",
        "Labour market conditions remain robust and wage growth is moderating.",
        "Financial conditions have eased and credit growth is recovering.",
        "Inflation expectations are well anchored at the medium-term target.",
        "The euro area economy is showing increasing resilience.",
        "Underlying inflation has continued to ease, supporting the disinflationary path.",
        "Confidence indicators improved and consumer spending strengthened.",
        "The transmission of monetary policy is working effectively.",
        "Risks to the outlook are broadly balanced.",
        "Activity has picked up and the recovery is broadening.",
        "Wages are growing in line with productivity gains.",
        "Trade flows have recovered after recent disruptions.",
        "Investment indicators are turning positive across major economies.",
        "Bank lending conditions have normalised.",
        # NEGATIVE / hawkish-concerned (15)
        "Inflation remains too high and risks are to the upside.",
        "Geopolitical tensions are weighing on the outlook.",
        "Financial conditions have tightened sharply.",
        "Consumer confidence has deteriorated and demand is weakening.",
        "The risk of a recession has materially increased.",
        "Wage pressures are persistent and could de-anchor expectations.",
        "Inflation expectations show signs of becoming unanchored.",
        "Trade tensions are creating significant downside risks.",
        "Bank lending standards have tightened materially.",
        "Energy prices are exerting renewed upward pressure on inflation.",
        "Manufacturing output has contracted for several months.",
        "External demand has weakened substantially.",
        "Credit risk premia have risen across the euro area.",
        "Core inflation remains elevated and sticky.",
        "Productivity growth has slowed, raising concerns about potential output.",
    ],
    "label": ["positive"] * 15 + ["negative"] * 15,
})

# Sanity check: report the size and class balance of the labelled set.
print(f"Training set: {len(training_data)} statements ({(training_data['label']=='positive').sum()} positive, {(training_data['label']=='negative').sum()} negative)")
training_data.head()

# Train/test split — 70/30, stratified so each set keeps class balance
all_texts = training_data["text"]
all_labels = training_data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    all_texts, all_labels,
    test_size=0.30, random_state=42, stratify=all_labels,
)

# Class counts per split, to confirm that stratification kept the balance.
n_pos_train = (y_train == "positive").sum()
n_neg_train = (y_train == "negative").sum()
n_pos_test = (y_test == "positive").sum()
n_neg_test = (y_test == "negative").sum()

print(f"Training set: {len(X_train)} ({n_pos_train} positive, {n_neg_train} negative)")
print(f"Test set:     {len(X_test)} ({n_pos_test} positive, {n_neg_test} negative)")

# Build a pipeline: TF-IDF + Multinomial Naive Bayes
tfidf_step = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=1)
nb_step = MultinomialNB(alpha=0.5)
classifier = make_pipeline(tfidf_step, nb_step)

# Fit on the training split only, then predict the held-out split.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report basic accuracy, both as a fraction and as correct/total.
accuracy = accuracy_score(y_test, y_pred)
n_correct = (y_pred == y_test).sum()
print(f"Test accuracy: {accuracy:.3f} ({n_correct}/{len(y_test)})")

# Five referee-style sentences for the "human vs machine" challenge.
challenge_reviews = [
    "The identification strategy is clever but the empirical implementation is weak.",
    "This is a paper I will cite — the instrument is novel and the results robust.",
    "I found the theoretical framework compelling, though the data section could be more detailed.",
    "While the authors address an important question, the paper feels underpowered.",
    "A well-executed study with policy-relevant findings and careful robustness checks.",
]

# Step 1 — class vote (no model output yet)
print("REVIEW BY REVIEW. Vote: P (positive) or N (negative). No conferring.\n")
for i, r in enumerate(challenge_reviews, 1):
    print(f"{i}. {r}")
    print("   Class tally: P = ___ , N = ___\n")

# Step 2 — run the model trained earlier in §6.A
preds  = classifier.predict(challenge_reviews)
probs  = classifier.predict_proba(challenge_reviews)
classes = classifier.classes_

# predict_proba columns follow classes_; find the "positive" column once instead
# of repeating the lookup for every review.
pos_col = list(classes).index("positive")

print(f"{'#':<3} {'Pred':<10} {'P(positive)':<13} Review")
print("-" * 100)
for i, (r, p, pr) in enumerate(zip(challenge_reviews, preds, probs), 1):
    p_pos = pr[pos_col]
    # Reviews are cut to 70 characters so each row stays on one line.
    print(f"{i:<3} {p:<10} {p_pos:>10.2f}    {r[:70]}")

# 5-fold stratified cross-validation on the FULL training data
# (no separate test set this time — CV does the splitting internally)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    classifier,
    training_data["text"],
    training_data["label"],
    cv=cv,
    scoring="accuracy",
)

# One formatted string per fold, then mean and spread across folds.
fold_strings = [format(score, ".3f") for score in cv_scores]
print(f"Per-fold accuracy: {fold_strings}")
mean_acc, std_acc = cv_scores.mean(), cv_scores.std()
print(f"Mean accuracy:     {mean_acc:.3f} ± {std_acc:.3f}")

# Per-class metrics on the held-out test set
report = classification_report(y_test, y_pred, digits=3)
print(report)

# Confusion matrix: rows are true labels, columns are predictions, both in the
# order given by `labels`.
cm = confusion_matrix(y_test, y_pred, labels=["negative", "positive"])
(neg_as_neg, neg_as_pos), (pos_as_neg, pos_as_pos) = cm
print("Confusion matrix:")
print("               pred_neg  pred_pos")
print(f"  true_neg     {neg_as_neg:>8}  {neg_as_pos:>8}")
print(f"  true_pos     {pos_as_neg:>8}  {pos_as_pos:>8}")

from sklearn.decomposition import TruncatedSVD

# Re-use the TF-IDF matrix logic from L7. Build it fresh here so the cell is self-contained.
tfidf = TfidfVectorizer(max_features=500, min_df=2, stop_words="english",
                        token_pattern=r"\b[a-zA-Z]{3,}\b", sublinear_tf=True)
X_tfidf = tfidf.fit_transform(corpus["body_clean"])

# LSA: keep up to 4 singular components. K must stay below min(n_docs, n_features),
# so it is capped for very small corpora (and never drops below 1).
# In practice with 50+ docs you'd use K=50-100.
lsa_k = max(1, min(4, min(X_tfidf.shape) - 1))
lsa = TruncatedSVD(n_components=lsa_k, random_state=42)
X_lsa = lsa.fit_transform(X_tfidf)

print(f"TF-IDF matrix:  {X_tfidf.shape} (sparse, {X_tfidf.nnz} non-zero / {X_tfidf.shape[0]*X_tfidf.shape[1]} cells)")
print(f"LSA matrix:     {X_lsa.shape} (dense)")
# Report the number of components actually fitted rather than a hard-coded "4".
print(f"\nVariance explained by {lsa_k} components: {lsa.explained_variance_ratio_.sum():.1%}")
print(f"Per-component:  {[f'{v:.1%}' for v in lsa.explained_variance_ratio_]}")

# Inspect the top words for each latent component
print("\nTop 8 terms per latent component:")
vocab = tfidf.get_feature_names_out()
for k, comp in enumerate(lsa.components_):
    # Rank terms by absolute loading: a large negative weight is as informative
    # as a large positive one. The sign is kept in the printout.
    top_idx = np.argsort(np.abs(comp))[-8:][::-1]
    top = [(vocab[i], comp[i]) for i in top_idx]
    print(f"  Component {k}: " + ", ".join(f"{w}({s:+.2f})" for w, s in top))

import gensim.downloader as api
# Loads the 100-dimensional GloVe vectors through gensim's own downloader.
# NOTE(review): `glove` is bound again further down by
# get_glove("glove-wiki-gigaword-100", ...), so when the file is run top to
# bottom this load is superseded — presumably the two are alternative ways of
# obtaining the same model; confirm which one is intended.
glove = api.load('glove-wiki-gigaword-100')  # 134 MB, downloads on first use

import urllib.request
import gzip
import shutil
from pathlib import Path
from gensim.models import KeyedVectors

# Target directory next to the notebook
# (resolved against the current working directory at the time this line runs).
EMB_DIR = Path.cwd() / "embeddings"
# exist_ok=True: no error if the directory is already there from an earlier run.
EMB_DIR.mkdir(exist_ok=True)

# Models to download: name -> URL on the gensim-data GitHub release mirror
# Each key is also used by get_glove() as the base name of the local files.
MODELS = {
    "glove-wiki-gigaword-50":  "https://github.com/piskvorky/gensim-data/releases/download/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz",
    "glove-wiki-gigaword-100": "https://github.com/piskvorky/gensim-data/releases/download/glove-wiki-gigaword-100/glove-wiki-gigaword-100.gz",
}

def get_glove(name: str, url: str) -> KeyedVectors:
    """Download (if needed), decompress, and load a GloVe model from the gensim-data mirror.

    Note: files on gensim-data are already in word2vec format (with header on the first line).

    Files are cached in ``EMB_DIR`` as ``<name>.gz`` (removed after a successful
    decompression) and ``<name>.txt``. If the ``.txt`` already exists, nothing
    is downloaded or decompressed.

    Both the download and the decompression write to a ``.part`` file that is
    renamed into place only on success, so an interrupted run never leaves a
    truncated ``.gz`` or ``.txt`` that a later call would mistake for a
    complete file.

    Args:
        name: Model name; used as the base name of the cached files.
        url: Location of the gzip-compressed model.

    Returns:
        The vectors loaded with ``KeyedVectors.load_word2vec_format``.
    """
    gz_path  = EMB_DIR / f"{name}.gz"
    txt_path = EMB_DIR / f"{name}.txt"

    if not txt_path.exists():
        if not gz_path.exists():
            print(f"Downloading {name} to {gz_path} (may take 1-2 minutes)...")
            gz_tmp = gz_path.with_name(gz_path.name + ".part")
            try:
                urllib.request.urlretrieve(url, gz_tmp)
                gz_tmp.replace(gz_path)
            finally:
                # Only still present if the download failed before the rename.
                if gz_tmp.exists():
                    gz_tmp.unlink()
            print("Download complete.")

        print(f"Decompressing {name}...")
        txt_tmp = txt_path.with_name(txt_path.name + ".part")
        try:
            with gzip.open(gz_path, "rb") as f_in, open(txt_tmp, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            txt_tmp.replace(txt_path)
        finally:
            # Only still present if decompression failed before the rename.
            if txt_tmp.exists():
                txt_tmp.unlink()
        # The archive is no longer needed once the text file is in place.
        gz_path.unlink()

    print(f"Loading {name}...")
    return KeyedVectors.load_word2vec_format(txt_path, binary=False)

# Load both embedding sizes (downloaded and cached on first use).
glove_50 = get_glove("glove-wiki-gigaword-50",  MODELS["glove-wiki-gigaword-50"])
glove    = get_glove("glove-wiki-gigaword-100", MODELS["glove-wiki-gigaword-100"])

print(f"\nglove-50  loaded:  {len(glove_50.key_to_index):,} words × {glove_50.vector_size}-dim")
print(f"glove-100 loaded: {len(glove.key_to_index):,} words × {glove.vector_size}-dim")

# Nearest neighbours of a single word.
print("\n--- Most similar to 'inflation' (top 5) ---")
for neighbour, score in glove.most_similar("inflation", topn=5):
    print(f"  {neighbour:<20} {score:.3f}")

# Vector-arithmetic analogies: (heading, words to add, words to subtract).
analogies = [
    ("\n--- Word analogy: king - man + woman ≈ ? ---", ["king", "woman"], ["man"]),
    ("\n--- Word analogy: euro - europe + japan ≈ ? ---", ["euro", "japan"], ["europe"]),
]
for heading, plus_words, minus_words in analogies:
    print(heading)
    result = glove.most_similar(positive=plus_words, negative=minus_words, topn=3)
    for neighbour, score in result:
        print(f"  {neighbour:<20} {score:.3f}")

# Document embeddings via mean-pooling: average word vectors of a document.
# Words not in GloVe vocabulary are skipped.
def document_vector(text, model):
    """Return the mean of the word vectors of the tokens in ``text``.

    ``text`` is coerced with ``str()`` and lower-cased; tokens are maximal runs
    of the letters a-z. Tokens missing from ``model.key_to_index`` are ignored.
    If no token is in the vocabulary, a zero vector of length
    ``model.vector_size`` is returned.
    """
    known = model.key_to_index
    tokens = re.findall(r"\b[a-z]+\b", str(text).lower())
    hits = [model[tok] for tok in tokens if tok in known]
    if hits:
        return np.mean(hits, axis=0)
    return np.zeros(model.vector_size)

# Embed each ECB document: one mean-pooled GloVe vector per row.
doc_vectors = np.vstack([document_vector(body, glove) for body in corpus["body_clean"]])
print(f"Document embeddings shape: {doc_vectors.shape}")

# Cosine similarity matrix using dense embeddings
from sklearn.metrics.pairwise import cosine_similarity
sim_glove = cosine_similarity(doc_vectors)

# Print the matrix as a table with document dates as row and column labels.
print("\nCosine similarity (GloVe-averaged document vectors):")
labels = corpus["date_parsed"].dt.strftime("%Y-%m-%d").values
header = "  ".join(labels)
print(f"             {header}")
for date_label, sims in zip(labels, sim_glove):
    row = "  ".join(f"{value:.3f}     " for value in sims)
    print(f"  {date_label}  {row}")

import os
import urllib.request

SOTU_URL = "https://raw.githubusercontent.com/sekhansen/text-mining-tutorial/master/speech_data_extend.txt"
SOTU_PATH = "speech_data_extend.txt"

if not os.path.exists(SOTU_PATH):
    print(f"Downloading SOTU dataset (~12 MB)...")
    # Download to a temporary name and rename on success, so an interrupted
    # download cannot leave a truncated file that the existence check above
    # would accept as complete on the next run.
    sotu_tmp = SOTU_PATH + ".part"
    try:
        urllib.request.urlretrieve(SOTU_URL, sotu_tmp)
        os.replace(sotu_tmp, SOTU_PATH)
    finally:
        # Only still present if the download failed before the rename.
        if os.path.exists(sotu_tmp):
            os.remove(sotu_tmp)
    print("Done.")

# Load: tab-separated, columns are [president, speech, year], with header row.
# quoting=3 is csv.QUOTE_NONE: quote characters in the speeches are plain text.
sotu = pd.read_csv(SOTU_PATH, sep="\t", quoting=3, on_bad_lines="skip")
sotu = sotu.dropna(subset=["speech", "year"])
sotu["year"] = sotu["year"].astype(int)

# Restrict to a manageable post-WWII window for speed (LDA on 23k paragraphs is slow)
sotu_modern = sotu[sotu["year"] >= 1947].reset_index(drop=True)
print(f"SOTU paragraphs (1947+): {len(sotu_modern):,}")
print(f"Speeches (year × president): {sotu_modern.groupby(['year','president']).ngroups}")
print(f"Year range: {sotu_modern['year'].min()}–{sotu_modern['year'].max()}")
sotu_modern.head(3)

# Fit LDA at the PARAGRAPH level — Hansen's key methodological choice.
# Each paragraph is treated as one document. Topics are estimated over paragraphs.

# Filter very short paragraphs (procedural noise): keep those with 30+ whitespace-separated words.
word_counts = sotu_modern["speech"].str.split().str.len()
sotu_paragraphs = sotu_modern[word_counts >= 30].reset_index(drop=True)
print(f"Paragraphs after filtering (≥30 words): {len(sotu_paragraphs):,}")

# Vectorise: BoW counts, basic English stopwords, drop very rare and very common words
vec_sotu = CountVectorizer(
    stop_words="english",
    min_df=10,
    max_df=0.5,
    token_pattern=r"\b[a-zA-Z]{4,}\b",
    max_features=3000,
)
X_sotu = vec_sotu.fit_transform(sotu_paragraphs["speech"])
print(f"Doc-term matrix: {X_sotu.shape}")

# Fit LDA with K=15 topics. (Hansen uses K=40 for FOMC; we use 15 for speed and clarity.)
from sklearn.decomposition import LatentDirichletAllocation
K = 15
print(f"\nFitting LDA with K={K}...")
lda_sotu = LatentDirichletAllocation(
    n_components=K,
    random_state=42,
    max_iter=15,
    learning_method="online",
    n_jobs=-1,
)
lda_sotu.fit(X_sotu)
print("Done.")

# Display top words per topic — let economic interpretation come from the loadings
vocab_sotu = vec_sotu.get_feature_names_out()
print("\nTop 10 words per topic:")
for topic_id, weights in enumerate(lda_sotu.components_):
    # argsort is ascending: take the last ten indices and reverse them.
    top_idx = weights.argsort()[-10:][::-1]
    print(f"  Topic {topic_id:2d}: " + ", ".join(vocab_sotu[i] for i in top_idx))

# Aggregate paragraph-level topic shares to speech (year-president) level.
# This is the core Hansen et al. step: we want a topic distribution per speech, not per paragraph.

paragraph_topics = lda_sotu.transform(X_sotu)  # shape (n_paragraphs, K)
print(f"Paragraph topic matrix: {paragraph_topics.shape}")

# Attach one "topic_<k>" column per topic to a fresh copy of the paragraph table.
topic_cols = [f"topic_{k}" for k in range(K)]
sotu_paragraphs = sotu_paragraphs.assign(
    **{col: paragraph_topics[:, idx] for idx, col in enumerate(topic_cols)}
)

# Average topic shares within each speech, ordered by year.
per_speech = sotu_paragraphs.groupby(["year", "president"])[topic_cols].mean()
speech_topics = per_speech.reset_index().sort_values("year")
print(f"\nSpeech-level topic shares: {speech_topics.shape}")
speech_topics.head()

# Plot the time series of selected topics — pick those that are most interpretable
# from the top-words above. The indices here match the random_state=42 run; if you
# re-run with a different seed or sklearn version, inspect the topics and update.

# Topic 6: defense/military; 11: foreign policy; 4: healthcare; 9: budget/fiscal
selected_topics = [6, 11, 4, 9]

fig, ax = plt.subplots(figsize=(12, 5))
years = speech_topics["year"]
for topic_id in selected_topics:
    # Legend entry: the topic's three highest-weighted words.
    top3 = lda_sotu.components_[topic_id].argsort()[-3:][::-1]
    label_words = ", ".join(vocab_sotu[i] for i in top3)
    ax.plot(
        years,
        speech_topics[f"topic_{topic_id}"],
        marker="o",
        markersize=3,
        linewidth=1.5,
        label=f"Topic {topic_id}: {label_words}",
    )

ax.set_xlabel("Year")
ax.set_ylabel("Topic share (mean over paragraphs)")
ax.set_title("U.S. State of the Union — topic shares over time (Hansen-style)",
             loc="left", fontsize=12)
ax.legend(loc="best", fontsize=9)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

# Task 1 — VADER and LM sentiment
# YOUR CODE HERE

# Task 2 — Supervised classifier
# YOUR CODE HERE

# Task 3 — Compare classifier vs VADER
# YOUR CODE HERE

# Task 4 (optional) — Logistic Regression vs Naive Bayes
# YOUR CODE HERE

# ── SOLUTION ──────────────────────────────────────────────────────────────────
import pandas as pd, numpy as np, matplotlib.pyplot as plt, re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ── Task 1 ────────────────────────────────────────────────────────────────────
analyzer = SentimentIntensityAnalyzer()
LM_NEGATIVE = {  # same set as §4
    "abandoned", "adverse", "affect", "against", "allegations",
    "bankruptcy", "breach", "burden",
    "cease", "challenges", "claims", "concerns", "conflicts", "constraints", "costs",
    "damages", "decline", "default", "deficiency", "delay", "difficulty",
    "dispute", "doubt", "downturn",
    "exposure",
    "fail", "failure", "falling", "fraud",
    "harmful",
    "impair", "impose", "instability",
    "lack", "lawsuit", "litigation", "loss", "losses",
    "negative",
    "objection", "obstacle",
    "plaintiff", "problem",
    "recession", "reject", "restraint", "restructuring", "risk", "risks",
    "slowdown", "stagnation",
    "termination", "threat",
    "unfavorable", "unstable",
    "volatility", "vulnerability",
    "weakness", "worsen", "worsening",
}


def lm_neg(text):
    """Return the fraction of a-z word tokens in ``text`` that are in LM_NEGATIVE.

    ``text`` is coerced with ``str()`` and lower-cased first. A text with no
    tokens gives 0.0 (the denominator is floored at 1).
    """
    tokens = re.findall(r"\b[a-z]+\b", str(text).lower())
    hits = [tok for tok in tokens if tok in LM_NEGATIVE]
    denominator = len(tokens) or 1
    return len(hits) / denominator

# Score every document with both methods.
bodies = corpus["body_clean"]
corpus["vader"] = bodies.apply(lambda doc: analyzer.polarity_scores(str(doc))["compound"])
corpus["lm_neg"] = bodies.apply(lm_neg)

# How closely do the two measures track each other?
vader_lm_corr = corpus[["vader", "lm_neg"]].corr().iloc[0, 1]
print(f"Correlation between VADER and LM-neg: {vader_lm_corr:.3f}")

# Two most extreme documents under each measure.
print("\nMost positive (VADER):")
print(corpus.nlargest(2, "vader")[["date_parsed", "vader"]].to_string(index=False))
print("\nMost negative (LM):")
print(corpus.nlargest(2, "lm_neg")[["date_parsed", "lm_neg"]].to_string(index=False))

# Both series on one figure; the scales differ, so LM-neg gets its own right-hand y-axis.
fig, ax1 = plt.subplots(figsize=(11, 4))
ax2 = ax1.twinx()
ax1.plot(corpus["date_parsed"], corpus["vader"], "o-", color="steelblue", label="VADER")
ax2.plot(corpus["date_parsed"], corpus["lm_neg"], "s-", color="tomato", label="LM-neg")
ax1.set_ylabel("VADER", color="steelblue")
ax2.set_ylabel("LM-neg", color="tomato")
ax1.set_title("Sentiment over time", loc="left")
fig.tight_layout()
plt.show()

# ── Task 2 ────────────────────────────────────────────────────────────────────
# Same stratified 70/30 split as in the lecture.
sol_texts = training_data["text"]
sol_labels = training_data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    sol_texts, sol_labels,
    test_size=0.30, random_state=42, stratify=sol_labels)

# TF-IDF (unigrams + bigrams) feeding a logistic regression.
lr_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=1)
lr_estimator = LogisticRegression(max_iter=1000)
clf_lr = make_pipeline(lr_vectorizer, lr_estimator)
clf_lr.fit(X_train, y_train)

# Held-out evaluation.
y_pred_lr = clf_lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"\nLogistic Regression — test accuracy: {lr_accuracy:.3f}")
print(classification_report(y_test, y_pred_lr, digits=3))

# 5-fold stratified CV on the full labelled set.
lr_folds = StratifiedKFold(5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf_lr, sol_texts, sol_labels, cv=lr_folds)
print(f"5-fold CV: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

# ── Task 3 ────────────────────────────────────────────────────────────────────
# Label every corpus document twice: with the trained classifier, and by the
# sign of its VADER score (strictly positive -> "positive", otherwise "negative").
corpus["clf_pred"] = clf_lr.predict(corpus["body_clean"])
vader_is_positive = corpus["vader"] > 0
corpus["vader_pred"] = np.where(vader_is_positive, "positive", "negative")

# Share of documents on which the two labels coincide, plus the full cross-tabulation.
same_label = corpus["clf_pred"] == corpus["vader_pred"]
agreement = same_label.mean()
print(f"\nClassifier vs VADER agreement on corpus: {agreement:.2%}")
agreement_table = pd.crosstab(
    corpus["clf_pred"], corpus["vader_pred"],
    rownames=["classifier"], colnames=["VADER"],
)
print(agreement_table)

# ── Task 4 (optional) ─────────────────────────────────────────────────────────
# Naive Bayes with the same TF-IDF features, scored on the same CV folds
# (same n_splits / shuffle / random_state) as the logistic regression.
nb_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=1)
clf_nb = make_pipeline(nb_vectorizer, MultinomialNB(alpha=0.5))
nb_folds = StratifiedKFold(5, shuffle=True, random_state=42)
cv_nb = cross_val_score(clf_nb, training_data["text"], training_data["label"], cv=nb_folds)

nb_mean, nb_std = cv_nb.mean(), cv_nb.std()
lr_mean, lr_std = cv_scores.mean(), cv_scores.std()
print(f"\nNaive Bayes 5-fold CV: {nb_mean:.3f} ± {nb_std:.3f}")
print(f"Logistic 5-fold CV:    {lr_mean:.3f} ± {lr_std:.3f}")

| Method | Library | Best for | Limits |
|---|---|---|---|
| VADER | `vaderSentiment` | General text, quick exploration | Domain mismatch, ceiling on long text |
| Loughran-McDonald | Manual / CSV | Financial / economic text | Word-list rigidity, no negation |
| Naive Bayes | `sklearn.naive_bayes` | Baseline supervised classifier | Independence assumption, smoothing-sensitive |
| Logistic Regression | `sklearn.linear_model` | Default supervised classifier | Needs labelled data |
| LSA (TruncatedSVD) | `sklearn.decomposition` | Dense doc representation, fast | Linear, lossy, requires re-fit on new corpora |
| GloVe (pre-trained) | `gensim.downloader` | Word similarity, OOV handling | Static — one vector per word regardless of context |
| LDA pipeline (Hansen-style) | `sklearn.decomposition` | Topic shares over time, replicates QJE 2018 | Needs many paragraphs, K is a judgment call |

Lecture 8 — Sentiment Analysis & Supervised ML for Text

Python for Economists · University of Bologna · 2025/2026

What we cover today

0. Setup — packages required for this lecture

1. Warm-up — a spam classifier in 10 lines, judged by you

Now let's test it

2. Sentiment analysis: dictionary-based approaches

3. VADER

4. Loughran-McDonald dictionary

5. Sentiment as an economic variable

6. Supervised classification of text

Train/test split — the most important step

⏱ Ten-minute challenge — Human vs machine

1 (`clever but weak`) — splits roughly 50/50. The "but" is the signal; readers anchor on whichever side they read last.

2 (`I will cite — novel, robust`) — near-unanimous positive.

3 (`compelling, though could be more detailed`) — splits, with most leaning positive.

4 (`important question, underpowered`) — splits, with most leaning negative.

5 (`well-executed, policy-relevant, careful`) — near-unanimous positive.

Back to §6 — now that you've seen the model fail, let's evaluate it properly

Cross-validation — accuracy is noisy on a single split

Beyond accuracy — precision, recall, F1, confusion matrix

7. From sparse to dense — Latent Semantic Analysis

8. Pre-trained word embeddings

9. Hansen-style replication — topic shares over time

10. Exercise — to do at home

Summary

Workflow that matters more than the model

Next lecture

Lecture 8 — Sentiment Analysis & Supervised ML for Text

Python for Economists · University of Bologna · 2025/2026

What we cover today

0. Setup — packages required for this lecture

1. Warm-up — a spam classifier in 10 lines, judged by you

Now let's test it

2. Sentiment analysis: dictionary-based approaches

3. VADER

4. Loughran-McDonald dictionary

5. Sentiment as an economic variable

6. Supervised classification of text

Train/test split — the most important step

⏱ Ten-minute challenge — Human vs machine

1 (clever but weak) — splits roughly 50/50. The "but" is the signal; readers anchor on whichever side they read last.

2 (I will cite — novel, robust) — near-unanimous positive.

3 (compelling, though could be more detailed) — splits, with most leaning positive.

4 (important question, underpowered) — splits, with most leaning negative.

5 (well-executed, policy-relevant, careful) — near-unanimous positive.

Back to §6 — now that you've seen the model fail, let's evaluate it properly

Cross-validation — accuracy is noisy on a single split

Beyond accuracy — precision, recall, F1, confusion matrix

7. From sparse to dense — Latent Semantic Analysis

8. Pre-trained word embeddings

9. Hansen-style replication — topic shares over time

10. Exercise — to do at home

Summary

Workflow that matters more than the model

Next lecture

1 (`clever but weak`) — splits roughly 50/50. The "but" is the signal; readers anchor on whichever side they read last.

2 (`I will cite — novel, robust`) — near-unanimous positive.

3 (`compelling, though could be more detailed`) — splits, with most leaning positive.

4 (`important question, underpowered`) — splits, with most leaning negative.

5 (`well-executed, policy-relevant, careful`) — near-unanimous positive.