conda activate your_environment
conda install -c conda-forge nltk spacy scikit-learn wordcloud
python -m spacy download en_core_web_sm

# Section 0 in the markdown above explains the one-time terminal setup.
# The nltk.download() calls below populate user-level data and are idempotent.

import pandas as pd
import numpy as np
import re, string
import matplotlib.pyplot as plt
from collections import Counter

import nltk
from pathlib import Path # object-oriented path handling: `Path.cwd()` returns the current working directory, and `/` joins path components in a cross-platform way

# Keep NLTK resources in a project-local folder (./nltk_data under the current
# working directory) and put that folder first on NLTK's search path so the
# resources downloaded below are the ones that get picked up.
NLTK_DIR = str(Path.cwd() / "nltk_data")
nltk.data.path.insert(0, NLTK_DIR)

# Resources used later in this file: "punkt_tab" (tokeniser models),
# "stopwords" (stopword lists) and "wordnet" (for WordNetLemmatizer).
# quiet=True suppresses the downloader's console output.
for pkg in ("punkt_tab", "stopwords", "wordnet"):
    nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

import spacy
nlp = spacy.load("en_core_web_sm")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

# Warm-up data: prefer the real L6 corpus, but only when it is large enough
# (at least 30 documents spanning at least 5 distinct years) to support a
# multi-year sentiment chart. Any failure — missing file, missing column,
# unparseable dates, or a corpus that is too small — drops through to a
# seeded synthetic monthly series so the plot is still illustrative.
try:
    _l6 = pd.read_csv("corpus_clean.csv")
    _l6["date"] = pd.to_datetime(_l6["date_parsed"])
    _l6 = _l6.rename(columns={"body_clean": "text"})[["date", "text"]].dropna()
    if len(_l6) < 30 or _l6["date"].dt.year.nunique() < 5:
        raise ValueError("L6 corpus too small for time-series demo")
    warmup_corpus, warmup_source = _l6, "L6 corpus (real)"
except Exception:
    # Deliberately broad: this cell must always end with something plottable.
    np.random.seed(0)  # fixed seed -> the same synthetic series on every run
    dates = pd.date_range("2000-01-01", "2024-12-31", freq="MS")
    templates = [
        "The outlook is strong, with inflation expectations well anchored and growth robust.",
        "Economic conditions remain fragile, with significant downside risks to growth and financial stability.",
        "Prices are stable, labour markets are healthy, and the recovery is broadening.",
        "Uncertainty is elevated; inflation is persistent and confidence in the outlook has deteriorated.",
    ]
    # One randomly chosen template per month start.
    _sampled_texts = np.random.choice(templates, size=len(dates))
    warmup_corpus = pd.DataFrame({"date": dates, "text": _sampled_texts})
    warmup_source = "synthetic 25-year fallback (random templates)"

print(f"Warm-up corpus: {len(warmup_corpus):,} documents — source: {warmup_source}")
warmup_corpus.head()

# Dictionary-based sentiment, simplified Loughran-McDonald style
POSITIVE = {"strong", "robust", "healthy", "stable", "recovery", "growth", "confident",
            "improved", "sound", "resilient", "expansion", "anchored", "broadening"}
NEGATIVE = {"fragile", "risks", "downturn", "deteriorated", "uncertainty", "weak",
            "persistent", "tightening", "contraction", "subdued", "slowdown", "elevated"}

# Characters trimmed from both ends of each whitespace-separated token:
# all ASCII punctuation plus typographic quotes and dashes. Trimming only
# "." and "," would leave tokens such as "elevated;" or "(strong)" unmatched.
_EDGE_PUNCT = string.punctuation + "\u2018\u2019\u201c\u201d\u2013\u2014"


def sentiment(text):
    """Return a net-tone score in [-1, 1] based on dictionary word counts.

    The text is lower-cased and split on whitespace; each token has leading
    and trailing punctuation removed before being looked up in POSITIVE and
    NEGATIVE.

    Parameters
    ----------
    text : any
        Document text. Non-string values are converted with ``str()``.

    Returns
    -------
    float
        ``(pos - neg) / (pos + neg)``, or ``0.0`` when no dictionary word
        is found.
    """
    pos = neg = 0
    for raw in str(text).lower().split():
        word = raw.strip(_EDGE_PUNCT)
        if word in POSITIVE:
            pos += 1
        elif word in NEGATIVE:  # the two dictionaries are disjoint
            neg += 1
    total = pos + neg
    return (pos - neg) / total if total else 0.0

# Score every document, then average the scores within each calendar month
# ("ME" = month-end bins).
warmup_corpus["sentiment"] = warmup_corpus["text"].apply(sentiment)
monthly = warmup_corpus.set_index("date")["sentiment"].resample("ME").mean()

fig, ax = plt.subplots(figsize=(10, 4))
# Plot a 6-month rolling mean of the monthly series (rolling(6) is called with
# its default settings).
ax.plot(monthly.index, monthly.rolling(6).mean(), linewidth=2, color="#1f77b4")
ax.axhline(0, color="black", linewidth=0.5)
# Mark reference episodes: a dashed vertical line at 1 January of each year and
# a text label offset to 1 June. The label height is taken from the upper
# y-limit as it stands at the time of each call.
for year, label in [(2008, "GFC"), (2012, "EZ crisis"), (2020, "COVID"), (2022, "Energy")]:
    ax.axvline(pd.Timestamp(f"{year}-01-01"), color="grey", alpha=0.3, linestyle="--")
    ax.text(pd.Timestamp(f"{year}-06-01"), ax.get_ylim()[1] * 0.85, label,
            fontsize=8, color="grey")
# NOTE(review): the title says "ECB communication" even when warmup_corpus is
# the synthetic fallback — confirm this is acceptable for the demo.
ax.set_title("ECB communication sentiment (6-month MA)", fontsize=12, loc="left")
ax.set_ylabel("Sentiment score")
ax.set_xlabel("")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
plt.tight_layout()
plt.show()

# Load the clean corpus from L6
# Columns read in this file: "date_parsed" (here) and "body_clean" (below).
try:
    corpus = pd.read_csv("corpus_clean.csv")
    # errors="coerce": unparseable dates become NaT instead of raising
    corpus["date_parsed"] = pd.to_datetime(corpus["date_parsed"], errors="coerce")
    corpus["year"] = corpus["date_parsed"].dt.year
    print(f"Corpus loaded: {len(corpus)} documents")
except FileNotFoundError:
    # Only a missing file triggers this fallback; any other error propagates.
    # Synthetic fallback: 8 short ECB-style monetary-policy statements (2022–2024)
    # The rows are not in chronological order, and the frame carries only the
    # columns date_parsed, title, body_clean and year.
    corpus = pd.DataFrame({
        "date_parsed": pd.to_datetime(["2024-01-25","2023-10-26","2023-09-14",
                                        "2022-12-15","2022-10-27","2022-09-08",
                                        "2024-04-11","2024-09-12"]),
        "title": ["Monetary policy decisions"] * 8,
        "body_clean": [
            "The Governing Council today decided to keep the three key ECB interest rates unchanged. "
            "Inflation is projected to decline gradually but will remain above the 2% target.",
            "The Governing Council kept rates unchanged. Underlying inflation eased and price pressures remain strong.",
            "The Governing Council raised the three key ECB interest rates by 25 basis points to ensure timely return of inflation to target.",
            "The Governing Council raised rates by 50 basis points. Interest rates will rise significantly to reach restrictive levels.",
            "The Governing Council raised the deposit facility rate by 75 basis points to tackle persistent inflation pressures.",
            "The Governing Council raised the deposit facility rate by 75 basis points to ensure inflation returns to the 2% medium-term target.",
            "Incoming data confirm the inflation outlook. Wage growth is moderating and financing conditions remain restrictive.",
            "The Governing Council lowered the deposit facility rate by 25 basis points based on the updated inflation outlook.",
        ],
    })
    corpus["year"] = corpus["date_parsed"].dt.year
    print(f"Using synthetic corpus ({len(corpus)} documents).")
corpus.head(3)

# Sample document: the first row of the corpus is used for every step below.
doc = corpus["body_clean"].iloc[0]
print("Original:")
print(doc[:200])
print()

# Step 1: Sentence splitting
sentences = sent_tokenize(doc)
print(f"Sentences ({len(sentences)}):")
for s in sentences[:3]:
    print(f"  {s}")

# Step 2: Word tokenisation (on the lower-cased text, so later stopword
# lookups are case-insensitive)
tokens = word_tokenize(doc.lower())
print(f"Total tokens: {len(tokens)}")
print("First 20:", tokens[:20])

# Step 3: Remove punctuation and non-alphabetic tokens
# str.isalpha() also drops numbers and tokens that mix letters with digits or
# symbols (e.g. "2%").
tokens_clean = [t for t in tokens if t.isalpha()]
print(f"After punctuation removal: {len(tokens_clean)} tokens")

# Step 4: Remove stopwords
stop_en = set(stopwords.words("english"))
# Add domain-specific stopwords — words that appear everywhere and carry no signal
domain_stop = {"per", "cent", "ecb", "governing", "council", "decided", "december", "march"}
stop_en.update(domain_stop)

tokens_no_stop = [t for t in tokens_clean if t not in stop_en]
print(f"After stopword removal: {len(tokens_no_stop)} tokens")
print("First 20:", tokens_no_stop[:20])

# Step 5a: Stemming — aggressive rule-based suffix stripping
# (the Porter rules are written for English; they use no dictionary)
stemmer = PorterStemmer()
stems = [stemmer.stem(t) for t in tokens_no_stop]
print("Stemmed:", stems[:20])
print()
# Notice: 'inflation' → 'inflat', 'rates' → 'rate', 'economy' → 'economi'
# Stemming is fast but produces non-words — fine for BoW, bad for display

# Step 5b: Lemmatisation — linguistically principled, produces real words
lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a POS argument here, so verb forms are
# generally left unchanged.
lemmas = [lemmatizer.lemmatize(t) for t in tokens_no_stop]
print("Lemmatised:", lemmas[:20])
print()
# 'rates' → 'rate', 'projected' → 'projected' (needs POS for best results)

# One call to the spaCy pipeline yields a Doc that already carries tokens,
# lemmas, part-of-speech tags, stopword flags and named entities.
doc_spacy = nlp(corpus["body_clean"].iloc[0])

print(f"{'Token':<20} {'Lemma':<20} {'POS':<8} {'Stop?'}")
print("-" * 60)
# Slicing the Doc gives the first 20 tokens without building a full list.
for tok in doc_spacy[:20]:
    print(f"{tok.text:<20} {tok.lemma_:<20} {tok.pos_:<8} {tok.is_stop}")

# Named entity recognition — handy for pulling out actors, locations, dates
print("Named entities:")
for entity in doc_spacy.ents[:15]:
    print(f"  {entity.text!r:<30} [{entity.label_}]")

# Build a cleaned token list using spaCy
def spacy_tokens(text, min_len=3):
    """Return the lower-cased lemmas of the content words in *text*.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only if it is not a stopword, not punctuation, not whitespace,
    purely alphabetic, and its surface form (``token.text``, not the lemma)
    has at least ``min_len`` characters.

    Parameters
    ----------
    text : str
        Raw document text.
    min_len : int, default 3
        Minimum length of the token's surface form.

    Returns
    -------
    list of str
        Lower-cased lemmas, in document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        if not tok.is_alpha or len(tok.text) < min_len:
            continue
        kept.append(tok.lemma_.lower())
    return kept

# Quick check of the tokeniser on the first document.
sample_tokens = spacy_tokens(corpus["body_clean"].iloc[0])
print(f"Tokens: {len(sample_tokens)}")
print("First 20:", sample_tokens[:20])

# Apply to entire corpus — can take a moment for large corpora
# NOTE: this calls spacy_tokens() — and therefore nlp() — once per document.
# spaCy's pipe() processes texts in batches and is much faster than calling
# nlp() in a loop; it is NOT used here, so consider it for large corpora.
corpus["tokens"] = [
    spacy_tokens(text)
    for text in corpus["body_clean"]
]
# .str.len() on a column of lists gives the number of tokens per document.
corpus["n_tokens"] = corpus["tokens"].str.len()

print(f"Avg tokens per document: {corpus['n_tokens'].mean():.0f}")
corpus[["date_parsed","n_tokens","tokens"]].head(3)

# scikit-learn vectorisers take raw strings and tokenise internally, so they
# are fed the cleaned text column (body_clean) directly.

# Settings shared by the Bag-of-Words and TF-IDF vectorisers:
#   max_features  keep only the 500 most frequent terms
#   min_df        ignore terms that appear in fewer than 2 docs
#   token_pattern only alphabetic tokens >= 3 chars
_shared_vec_args = dict(
    max_features=500,
    min_df=2,
    stop_words="english",
    token_pattern=r"\b[a-zA-Z]{3,}\b",
)

# Bag-of-Words
bow_vec = CountVectorizer(**_shared_vec_args)
bow_matrix = bow_vec.fit_transform(corpus["body_clean"])
vocab = bow_vec.get_feature_names_out()
print(f"BoW matrix shape: {bow_matrix.shape}")
print(f"Vocabulary size: {len(vocab)}")
print(f"Sample vocab (first 20): {list(vocab)[:20]}")

# Top 10 most frequent terms across the corpus (column sums of the count matrix)
freq = np.asarray(bow_matrix.sum(axis=0)).ravel()
top_freq = sorted(zip(vocab, freq), key=lambda pair: -pair[1])[:10]
print(f"Top 10 by raw frequency: {top_freq}")

# TF-IDF — same settings, plus sublinear_tf, which uses a log-scaled term
# frequency to dampen the effect of very frequent terms
tfidf_vec = TfidfVectorizer(sublinear_tf=True, **_shared_vec_args)
tfidf_matrix = tfidf_vec.fit_transform(corpus["body_clean"])
print(f"TF-IDF matrix: {tfidf_matrix.shape}")

# Dense DataFrame view for inspection: one row per document (indexed by date),
# one column per term
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=corpus["date_parsed"],
    columns=tfidf_vec.get_feature_names_out(),
)
tfidf_df.iloc[:3, :8].round(3)

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Take the first document as a reference and compare it to every doc in the corpus
ref_idx = 0
ref_date = corpus["date_parsed"].iloc[ref_idx].strftime("%Y-%m-%d")

# Each call returns a (1 x n_docs) array; flatten to a 1-D vector per metric.
cos_to_ref  = cosine_similarity(tfidf_matrix[ref_idx], tfidf_matrix).flatten()
eucl_to_ref = euclidean_distances(tfidf_matrix[ref_idx], tfidf_matrix).flatten()

# The synthetic fallback corpus has no "n_words" column, so indexing it
# unconditionally raises KeyError. Use the column when it exists, otherwise
# derive a whitespace-delimited word count from the text.
if "n_words" in corpus.columns:
    _n_words = corpus["n_words"].values
else:
    _n_words = corpus["body_clean"].str.split().str.len().values

similarity_table = pd.DataFrame({
    "date":      corpus["date_parsed"].dt.strftime("%Y-%m-%d").values,
    "n_words":   _n_words,
    "cosine":    cos_to_ref.round(3),
    "euclidean": eucl_to_ref.round(3),
})
print(f"Similarity to reference document ({ref_date}):\n")
print(similarity_table.to_string(index=False))

# YOUR CODE HERE — 10 minutes
# Skeleton (Option A):
# from sklearn.feature_extraction.text import TfidfVectorizer
# vocab = tfidf_vec.get_feature_names_out()
# rows = []
# for i in range(tfidf_matrix.shape[0]):
#     top_idx = tfidf_matrix.toarray()[i].argsort()[-5:][::-1]
#     rows.append([vocab[j] for j in top_idx])
# top_terms = pd.DataFrame(rows, columns=[f"rank_{k+1}" for k in range(5)],
#                          index=corpus["date_parsed"])
# top_terms

# Skeleton (Option B):
# from sklearn.metrics.pairwise import cosine_similarity
# sim = cosine_similarity(tfidf_matrix)
# fig, ax = plt.subplots(figsize=(6, 5))
# im = ax.imshow(sim, cmap="viridis", vmin=0, vmax=1)
# labels = [d.strftime("%Y-%m-%d") for d in corpus["date_parsed"]]
# ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45, ha="right")
# ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
# plt.colorbar(im, ax=ax, label="cosine similarity")
# fig.tight_layout(); plt.show()

# ── SOLUTION (Option A) ──────────────────────────────────────────────────────
vocab = tfidf_vec.get_feature_names_out()
# Densify the sparse matrix once; converting inside the loop would rebuild the
# full dense array for every document.
_dense_tfidf = tfidf_matrix.toarray()
# Never ask for more ranks than there are terms, so the column labels below
# always match the row width.
_n_top = min(5, len(vocab))
# argsort is ascending: take the last _n_top indices and reverse them so that
# rank_1 is the highest-weighted term.
rows = [
    [vocab[j] for j in doc_weights.argsort()[-_n_top:][::-1]]
    for doc_weights in _dense_tfidf
]
top_terms = pd.DataFrame(rows,
                         columns=[f"rank_{k+1}" for k in range(_n_top)],
                         index=corpus["date_parsed"])
print("Top 5 distinctive terms per document (TF-IDF):")
print(top_terms.to_string())

# ── SOLUTION (Option B) ──────────────────────────────────────────────────────
from sklearn.metrics.pairwise import cosine_similarity
# Full (n_docs x n_docs) matrix of pairwise cosine similarities.
sim = cosine_similarity(tfidf_matrix)

fig, ax = plt.subplots(figsize=(6, 5))
# Colour scale fixed to [0, 1].
im = ax.imshow(sim, cmap="viridis", vmin=0, vmax=1)
labels = [d.strftime("%Y-%m-%d") for d in corpus["date_parsed"]]
ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
ax.set_title("Pairwise cosine similarity (TF-IDF)", loc="left")
plt.colorbar(im, ax=ax, label="cosine similarity")
fig.tight_layout(); plt.show()

# Identify most-similar and most-different pairs (excluding diagonal)
# The diagonal (each document against itself) is set to NaN on a copy so the
# NaN-aware argmax/argmin skip it; unravel_index turns the flat position back
# into a (row, column) pair.
sim_no_diag = sim.copy()
np.fill_diagonal(sim_no_diag, np.nan)
i_max, j_max = np.unravel_index(np.nanargmax(sim_no_diag), sim_no_diag.shape)
i_min, j_min = np.unravel_index(np.nanargmin(sim_no_diag), sim_no_diag.shape)
print(f"\nMost similar:   {labels[i_max]} ↔ {labels[j_max]}  (cos = {sim[i_max, j_max]:.3f})")
print(f"Most different: {labels[i_min]} ↔ {labels[j_min]}  (cos = {sim[i_min, j_min]:.3f})")

# Vectorise (LDA works on raw counts, NOT TF-IDF — its generative model assumes counts)
# max_df=0.9 additionally drops terms present in more than 90% of documents.
vec_lda = CountVectorizer(
    stop_words="english", max_df=0.9, min_df=2, max_features=2000,
    token_pattern=r"\b[a-zA-Z]{3,}\b",
)
X_lda = vec_lda.fit_transform(corpus["body_clean"])

# Number of topics: rule of thumb is K ~ sqrt(n_docs), so 3 for 8 docs, 7-10 for 50-100, etc.
# With a real ECB corpus you'd want to try K ∈ {5, 8, 10, 15} and pick by coherence.
# NOTE(review): K is fixed at 5 regardless of corpus size, so with the 8-document
# synthetic corpus it does not follow the rule of thumb above — confirm intent.
K = 5

# random_state=0 makes the fitted topics reproducible between runs.
lda = LatentDirichletAllocation(n_components=K, random_state=0, max_iter=50)
lda.fit(X_lda)

vocab_lda = vec_lda.get_feature_names_out()
# This message is printed after the model has already been fitted.
print(f"Doc-term matrix: {X_lda.shape}, fitting K={K} topics\n")
# lda.components_ has one row of term weights per topic; list the ten
# highest-weighted terms of each, in descending order.
for k, topic in enumerate(lda.components_):
    top = topic.argsort()[-10:][::-1]
    print(f"Topic {k}: " + ", ".join(vocab_lda[i] for i in top))

# Top terms by mean TF-IDF across all documents
# tfidf_df has one column per term, so .mean() yields one average per term.
mean_tfidf = tfidf_df.mean().sort_values(ascending=False)
top20 = mean_tfidf.head(20)

fig, ax = plt.subplots(figsize=(9, 5))
# Reverse the order so the highest-scoring term is drawn at the top of the
# horizontal bar chart (barh places the first entry at the bottom).
ax.barh(top20.index[::-1], top20.values[::-1], color="steelblue", edgecolor="white")
ax.set_title("Top 20 terms by mean TF-IDF — ECB corpus", fontsize=12)
ax.set_xlabel("Mean TF-IDF score")
fig.tight_layout()
plt.show()

# Compare term weights across two time periods: documents dated 2024 or later
# ("cuts/holds") versus everything else ("hikes").
# Rows whose year is missing (NaN) fail the `>= 2024` test and therefore land
# in the "≤2023 (hikes)" group.
# NOTE(review): how many documents fall into each period depends on the corpus
# actually loaded — verify the split before interpreting the comparison.
corpus["period"] = corpus["year"].apply(
    lambda yr: "≥2024 (cuts/holds)" if yr >= 2024 else "≤2023 (hikes)"
)


def _top_mean_tfidf(texts, n_terms=15):
    """Fit a fresh TF-IDF model on *texts* and return the n_terms highest mean weights."""
    vectoriser = TfidfVectorizer(max_features=300, min_df=1, stop_words="english",
                                 token_pattern=r"\b[a-zA-Z]{3,}\b")
    weights = vectoriser.fit_transform(texts)
    per_term = pd.Series(weights.mean(axis=0).A1,
                         index=vectoriser.get_feature_names_out())
    return per_term.sort_values(ascending=False).head(n_terms)


# A separate vocabulary and IDF are fitted within each period.
groups = {
    label: _top_mean_tfidf(subset["body_clean"])
    for label, subset in corpus.groupby("period")
}

# Display side-by-side: one column per period, each cell "term (score)"
print("Top 15 terms by mean TF-IDF, per period:\n")
comparison = pd.DataFrame({
    label: pd.Series([f"{term} ({score:.3f})" for term, score in ranked.items()])
    for label, ranked in groups.items()
})
print(comparison.to_string(index=False))

# Word co-occurrence — which words tend to show up in the same documents?
# A rough, BoW-based stand-in for semantic relatedness.
bow_small = CountVectorizer(max_features=30, min_df=2, stop_words="english",
                             token_pattern=r"\b[a-zA-Z]{4,}\b")
bow_mat = bow_small.fit_transform(corpus["body_clean"]).toarray()
terms = bow_small.get_feature_names_out()

# Co-occurrence matrix (term × term). Entry (i, j) is the sum over documents of
# count_i * count_j, i.e. raw and NOT normalised; the diagonal (a term with
# itself) is zeroed out.
cooc = (bow_mat.T @ bow_mat).astype(float)
np.fill_diagonal(cooc, 0)

# Collect every unordered pair with a positive score (upper triangle only),
# then order by score, highest first.
pairs = [
    (terms[a], terms[b], cooc[a, b])
    for a in range(len(terms))
    for b in range(a + 1, len(terms))
    if cooc[a, b] > 0
]
pairs.sort(key=lambda entry: -entry[2])
print("Top 10 co-occurring term pairs (raw counts):")
for left, right, weight in pairs[:10]:
    print(f"  {left:<15} ↔ {right:<15} count={weight:.0f}")

# Build the frequency dictionary from BoW counts on the corpus we've been using.
# This uses the same preprocessing logic as §5 (alphabetic, ≥4 chars, English stopwords)
# plus a domain stopword list to suppress boilerplate.
# NOTE(review): the earlier BoW/TF-IDF vectorisers in this file use a ≥3-character
# pattern, whereas this one uses ≥4 — confirm which is intended.
import os, urllib.request
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image, ImageDraw, ImageFont
from scipy.ndimage import gaussian_gradient_magnitude

DOMAIN_STOP = {"per", "cent", "ecb", "governing", "council", "decided",
               "december", "march", "today", "question", "think", "said"}

bow_for_wc = CountVectorizer(max_features=300, min_df=1, stop_words="english",
                             token_pattern=r"\b[a-zA-Z]{4,}\b")
bow_wc_matrix = bow_for_wc.fit_transform(corpus["body_clean"])
freq_array = bow_wc_matrix.sum(axis=0).A1
vocab_wc = bow_for_wc.get_feature_names_out()
# DOMAIN_STOP is applied after the vectoriser has picked its 300 features, so
# the dictionary can end up with fewer than 300 entries.
# `freq` is rebound here: from this point on it is a {term: count} dict.
freq = {w: int(c) for w, c in zip(vocab_wc, freq_array) if w not in DOMAIN_STOP}

print(f"Frequency dictionary: {len(freq)} terms")
print(f"Top 10: {sorted(freq.items(), key=lambda x: -x[1])[:10]}")

# Variant 1: plain rectangular cloud drawn from the {term: count} dictionary.
# random_state fixes the layout so the figure is reproducible.
wc1 = WordCloud(width=800, height=400, background_color="white",
                max_words=150, colormap="viridis", random_state=42)
wc1.generate_from_frequencies(freq)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc1, interpolation="bilinear")
ax.axis("off")
# The frequencies come from CountVectorizer on the raw text, with no
# lemmatisation, so the title says "terms" rather than "lemmas".
ax.set_title("ECB corpus — classic wordcloud (top 150 BoW terms)", loc="left", fontsize=11)
fig.tight_layout()
plt.show()

def find_bold_font():
    """Return the file path of a bold sans-serif font, located via matplotlib.

    The lookup goes through matplotlib's font manager, which is already a
    dependency of this file and works the same way on every platform.
    ``findfont`` is called with its default settings.
    NOTE(review): presumably it falls back to matplotlib's bundled default
    font when nothing matches, rather than returning None — confirm against
    the matplotlib documentation.
    """
    from matplotlib import font_manager

    bold_sans = font_manager.FontProperties(family="sans-serif", weight="bold")
    return font_manager.findfont(bold_sans)

def make_euro_mask(size=600):
    """Return a (size x size) greyscale array holding a centred € glyph.

    Convention: white pixels (255) are masked out; dark pixels (0) are where
    words may be placed.

    Parameters
    ----------
    size : int, default 600
        Side length of the square mask in pixels. The glyph is rendered at
        a font size of 85% of this value.
    """
    euro = "\u20ac"
    canvas = Image.new("L", (size, size), color=255)
    pen = ImageDraw.Draw(canvas)
    glyph_font = ImageFont.truetype(find_bold_font(), int(size * 0.85))

    # Measure the glyph's bounding box when drawn at the origin, then shift the
    # drawing origin so that box sits in the middle of the canvas.
    left, top, right, bottom = pen.textbbox((0, 0), euro, font=glyph_font)
    origin = (
        (size - (right - left)) // 2 - left,
        (size - (bottom - top)) // 2 - top,
    )
    pen.text(origin, euro, fill=0, font=glyph_font)
    return np.array(canvas)

# Variant 2: words are confined to the dark (0) pixels of the € mask.
euro_mask = make_euro_mask(size=600)

# No width/height is passed here; the mask array is supplied instead.
# contour_* draws an outline around the mask shape.
wc2 = WordCloud(mask=euro_mask, background_color="white",
                max_words=300, colormap="Blues", random_state=42,
                contour_width=2, contour_color="#003299")  # ECB blue
wc2.generate_from_frequencies(freq)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wc2, interpolation="bilinear")
ax.axis("off")
ax.set_title("ECB corpus — € mask", loc="left", fontsize=11)
fig.tight_layout()
plt.show()

# Download the parrot image from the wordcloud examples (MIT-licensed).
# After the first run it's cached locally, so re-execution is offline-friendly.
import urllib.request
PARROT_URL = ("https://raw.githubusercontent.com/amueller/word_cloud/"
              "main/examples/parrot-by-jose-mari-gimenez2.jpg")
PARROT_PATH = "parrot.jpg"
if not os.path.exists(PARROT_PATH):
    print(f"Downloading parrot image from {PARROT_URL}...")
    # Download to a temporary name and rename only on success. Writing straight
    # to PARROT_PATH would leave a truncated file behind if the transfer is
    # interrupted, and every later run would then treat it as a valid cache.
    _partial_path = PARROT_PATH + ".part"
    try:
        urllib.request.urlretrieve(PARROT_URL, _partial_path)
        os.replace(_partial_path, PARROT_PATH)
    finally:
        # Only present here if the download or rename failed.
        if os.path.exists(_partial_path):
            os.remove(_partial_path)
    print("Done.")

# Load and subsample (the full-resolution image is too large to render quickly).
# The context manager closes the file handle once the pixels have been copied.
with Image.open(PARROT_PATH) as _parrot_img:
    parrot_color = np.array(_parrot_img)
parrot_color = parrot_color[::3, ::3]  # keep every 3rd pixel along both axes
print(f"Parrot image after 3x subsampling: {parrot_color.shape}")

# Build the placement mask: same convention (white = excluded).
# Where the original image is pure black (sum == 0), treat it as background.
# Work on a copy so parrot_color keeps the original colours for recolouring.
parrot_mask = parrot_color.copy()
parrot_mask[parrot_mask.sum(axis=2) == 0] = 255

# Edge detection on the colour image to enforce sharp colour boundaries —
# words near the edges of distinct colour regions are pushed inwards,
# which prevents the colours from looking washed out in the final cloud.
# The gradient magnitude is computed per channel (index 0-2, values scaled to
# [0, 1], Gaussian sigma 2) and averaged; pixels above 0.08 are excluded too.
edges = np.mean([gaussian_gradient_magnitude(parrot_color[:, :, i] / 255., 2)
                 for i in range(3)], axis=0)
parrot_mask[edges > .08] = 255

# Build the wordcloud — note `max_words=2000` to encourage dense fill,
# and `relative_scaling=0` for the "less accurate but better-looking" trade-off.
# NOTE(review): `freq` is built from a vectoriser capped at 300 features, so
# max_words=2000 looks like an upper bound that is never reached — confirm
# whether a denser fill was intended.
wc3 = WordCloud(max_words=2000, mask=parrot_mask, max_font_size=40,
                random_state=42, relative_scaling=0, background_color="white")
wc3.generate_from_frequencies(freq)

# Recolour: each word inherits the colour of the pixel it lands on
image_colors = ImageColorGenerator(parrot_color)
wc3.recolor(color_func=image_colors)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wc3, interpolation="bilinear")
ax.axis("off")
ax.set_title("ECB corpus — parrot-shaped image-coloured wordcloud", loc="left", fontsize=11)
fig.tight_layout()
plt.show()

# Task 1 — preprocessing pipeline
# YOUR CODE HERE

# Task 2 — TF-IDF analysis
# YOUR CODE HERE

# Task 3 — group comparison
# YOUR CODE HERE

# Task 4 (optional) — cosine similarity over time
# YOUR CODE HERE

# ── SOLUTION ──────────────────────────────────────────────────────────────────
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
nlp = spacy.load("en_core_web_sm")

# Reload the L6 corpus for the exercise solution. There is no try/except here,
# so a missing corpus_clean.csv raises FileNotFoundError.
corpus = pd.read_csv("corpus_clean.csv")
# errors="coerce": unparseable dates become NaT (and their year becomes NaN).
corpus["date_parsed"] = pd.to_datetime(corpus["date_parsed"], errors="coerce")
corpus["year"] = corpus["date_parsed"].dt.year

# ── Task 1 ────────────────────────────────────────────────────────────────────
# Domain boilerplate removed in addition to spaCy's built-in stopwords.
EXTRA_STOP = {"governing", "council", "ecb", "today", "decided"}

def spacy_tokens(text):
    """Return lower-cased lemmas of the content words in *text*.

    A token is dropped if it is a stopword, punctuation or whitespace, is not
    purely alphabetic, has a surface form shorter than 3 characters, or its
    lower-cased lemma is listed in EXTRA_STOP.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        if not tok.is_alpha or len(tok.text) < 3:
            continue
        lemma = tok.lemma_.lower()
        if lemma not in EXTRA_STOP:
            lemmas.append(lemma)
    return lemmas

# One token list per document.
corpus["tokens"] = corpus["body_clean"].apply(spacy_tokens)
print(f"Mean tokens/doc: {corpus['tokens'].str.len().mean():.0f}")

# ── Task 2 ────────────────────────────────────────────────────────────────────
# TF-IDF is fitted on the raw body_clean text, not on the spaCy tokens from
# Task 1; the vectoriser applies its own tokenisation and stopword list.
tfidf = TfidfVectorizer(max_features=500, min_df=2, stop_words="english",
                        token_pattern=r"\b[a-zA-Z]{3,}\b", sublinear_tf=True)
X = tfidf.fit_transform(corpus["body_clean"])
print(f"Vocabulary: {len(tfidf.get_feature_names_out())}")

# Average weight of each term over all documents (column means of the dense matrix).
mean_tfidf = pd.Series(X.toarray().mean(axis=0), index=tfidf.get_feature_names_out())
top20 = mean_tfidf.sort_values(ascending=False).head(20)

fig, ax = plt.subplots(figsize=(9, 5))
# Reversed so the highest-scoring term ends up at the top of the chart.
ax.barh(top20.index[::-1], top20.values[::-1], color="steelblue")
ax.set_title("Top 20 terms by mean TF-IDF")
fig.tight_layout(); plt.show()

# ── Task 3 ────────────────────────────────────────────────────────────────────
# Rows with a missing year (NaN) fail the `>= 2024` test and are labelled "≤2023".
corpus["period"] = corpus["year"].apply(lambda y: "≥2024" if y >= 2024 else "≤2023")
# A separate TF-IDF model (own vocabulary and IDF) is fitted for each period.
for period, grp in corpus.groupby("period"):
    vec = TfidfVectorizer(max_features=300, min_df=1, stop_words="english",
                          token_pattern=r"\b[a-zA-Z]{3,}\b")
    mat = vec.fit_transform(grp["body_clean"])
    means = pd.Series(mat.mean(axis=0).A1, index=vec.get_feature_names_out())
    print(f"\n{period}:", means.nlargest(15).index.tolist())

# ── Task 4 ────────────────────────────────────────────────────────────────────
sim = cosine_similarity(X)
# The reference is row 0 of the corpus. Capture its date BEFORE sorting: after
# sort_index() the first index entry is the earliest date, which is a
# different document whenever the corpus is not stored in ascending date order.
_ref_date = corpus["date_parsed"].iloc[0]
# Row 0 of the similarity matrix, re-indexed by date and put in chronological
# order for plotting.
sim_to_first = pd.Series(sim[0], index=corpus["date_parsed"]).sort_index()

fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(sim_to_first.index, sim_to_first.values, marker="o", color="tomato")
ax.set_title(f"Cosine similarity to first document ({_ref_date.date()})")
ax.set_ylabel("Cosine similarity"); ax.grid(alpha=0.3)
fig.tight_layout(); plt.show()

Step	Tool	Notes
Tokenise	`nltk.word_tokenize` / `spacy`	spaCy is more accurate
Remove stopwords	`nltk.stopwords` / `spacy is_stop`	Add domain-specific stops
Stem	`nltk.PorterStemmer`	Fast, non-words
Lemmatise	`spacy token.lemma_`	Accurate, real words
Bag-of-words	`sklearn.CountVectorizer`	Sparse matrix
TF-IDF	`sklearn.TfidfVectorizer`	Use `sublinear_tf=True`
Topic modelling	`sklearn.LatentDirichletAllocation`	Counts (not TF-IDF); choose K by coherence
Visualisation	`wordcloud.WordCloud`	Display tool, not analysis tool

Lecture 7 — NLP: Text Representation¶

Python for Economists · University of Bologna · 2025/2026¶

What we cover today¶

0. Setup — packages required for this lecture¶

1. Warm-up — text → numbers → insight in 2 minutes¶

2. The NLP pipeline¶

3. Tokenisation, stopwords, stemming¶

4. Lemmatisation and POS tagging with spaCy¶

5. Bag-of-Words and TF-IDF¶

Bag-of-Words (BoW)¶

TF-IDF¶

5.bis Document similarity with cosine¶

Why not Euclidean distance?¶

Cosine similarity — the angle between two vectors¶

⏱ Ten-minute challenge — your first text-feature analysis¶

6. Topic modelling with LDA — what is the corpus about?¶

7. Keyword analysis: most discriminating terms¶

8. Wordclouds — visualisation, not analysis¶

Variant 1 — Classic rectangular wordcloud¶

Variant 2 — Mask shaped like €¶

Variant 3 — Image-coloured wordcloud (the parrot example)¶

9. Exercise¶

Summary¶

Next lecture¶