conda activate your_environment
conda install -c conda-forge scikit-learn pandas matplotlib spacy
python -m spacy download en_core_web_md     # ~50 MB English model with word vectors
pip install transformers torch              # HuggingFace stack (no conda-forge build)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, re, warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (classification_report, confusion_matrix,
                              ConfusionMatrixDisplay, roc_auc_score)
from sklearn.pipeline import Pipeline

print("sklearn loaded.")

# Open-source transformer, runs locally, completely free.
# First run downloads ~1.5 GB and caches it in ~/.cache/huggingface/
# (Recommended: warm the cache by running this cell at home before class.)

from transformers import pipeline as hf_pipeline

# Build the zero-shot pipeline once and keep it in a module-level name.
zs_classifier = hf_pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=-1,  # CPU; pass 0 to run on the first GPU instead
)
print("Zero-shot classifier loaded.")

# Classify headlines into policy areas - no labelled data, no training, no payment.

# Five example headlines to sort into policy areas.
headlines = [
    "ECB raises rates by 25bp to anchor inflation expectations",
    "Commission proposes new fiscal rules with flexibility for investment",
    "Eurozone unemployment falls to record low in Q3",
    "Germany announces 100bn special fund for defence",
    "ECB signals patience on further hikes as core inflation moderates",
]

# Candidate labels handed to the zero-shot classifier for every headline.
policy_areas = ["monetary policy", "fiscal policy", "labour", "defence", "other"]

table_header = f"{'Predicted label':<20}  {'Confidence':<11}  Headline"
print(table_header)
print("-" * 110)
for headline in headlines:
    ranked = zs_classifier(headline, candidate_labels=policy_areas)
    # The first label/score pair is reported as the prediction.
    best_label, best_score = ranked["labels"][0], ranked["scores"][0]
    print(f"{best_label:<20}  {best_score:.2f}         {headline}")

# Build a synthetic stand-in for the Manifesto Project dataset
# (the real data is public, available at manifesto-project.wzb.eu).
# Nothing is loaded from disk here: the documents are generated below from
# the phrase pools in this cell. In the exercise you will work with the real data.

np.random.seed(42)   # fix NumPy's global RNG so the generated corpus is reproducible
n = 300              # total number of synthetic documents

# Phrases that lean left or right but are NOT perfectly diagnostic
left_phrases = [
    "workers should have stronger protections",
    "public investment in healthcare is needed",
    "inequality has grown too much",
    "social safety nets need strengthening",
    "the wealthy should contribute more in taxes",
    "collective bargaining helps wage growth",
    "income support programs reduce poverty",
    "climate action requires public spending",
    "housing affordability is a serious problem",
    "public services should be expanded",
    "regulation can prevent market failures",
    "the welfare state needs reinforcement",
]
right_phrases = [
    "markets generally allocate resources well",
    "government spending should be controlled",
    "lower taxes can stimulate growth",
    "individual choice matters in economic outcomes",
    "regulation can impose unnecessary costs",
    "the private sector drives most innovation",
    "fiscal discipline matters for credibility",
    "competition policy benefits consumers",
    "property rights underpin investment",
    "trade openness has aggregate benefits",
    "government programs should be evaluated for efficiency",
    "labor market flexibility supports employment",
]

# Neutral vocabulary - phrases either side might use; mixed into every
# document so the two classes share part of their wording.
shared_phrases = [
    "the economy faces multiple challenges",
    "policy tradeoffs need to be considered",
    "data should inform decision making",
    "long term outcomes matter",
    "the evidence is mixed",
    "economic growth has slowed recently",
    "inflation concerns are widespread",
    "labor markets are evolving",
    "households face rising costs",
    "investment decisions depend on conditions",
]

def make_doc(primary_phrases, primary_share=0.55, shared_share=0.30):
    """Build one synthetic document as a space-joined, shuffled bag of phrases.

    A target length of 6-11 phrases is drawn first. ``primary_share`` of it
    (at least one phrase) is sampled from ``primary_phrases``, ``shared_share``
    of it (at least one phrase) from the neutral pool, and whatever remains
    from the opposing pool. All sampling is with replacement and uses NumPy's
    global random state.

    The opposing pool is ``left_phrases`` only when ``primary_phrases`` is the
    very same list object as ``right_phrases``; any other argument is paired
    with ``right_phrases``.
    """
    target_len = np.random.randint(6, 12)
    primary_count = max(1, int(target_len * primary_share))
    shared_count = max(1, int(target_len * shared_share))
    opponent_count = max(0, target_len - primary_count - shared_count)

    if primary_phrases is right_phrases:
        opponent_pool = left_phrases
    else:
        opponent_pool = right_phrases

    # Draw order (primary, shared, opponent) fixes the random stream consumed.
    sentences = [
        *np.random.choice(primary_phrases, primary_count, replace=True),
        *np.random.choice(shared_phrases, shared_count, replace=True),
    ]
    if opponent_count > 0:
        sentences.extend(np.random.choice(opponent_pool, opponent_count, replace=True))

    np.random.shuffle(sentences)
    return " ".join(sentences)

# Generate the left-leaning half first, then the right-leaning half.
docs_per_side = n // 2
left_docs = [make_doc(left_phrases) for _ in range(docs_per_side)]
right_docs = [make_doc(right_phrases) for _ in range(docs_per_side)]

texts = left_docs + right_docs
labels = ["left"] * docs_per_side + ["right"] * docs_per_side

# Assemble the frame and shuffle the rows so the classes are interleaved.
manifesto = (
    pd.DataFrame({"text": texts, "orientation": labels})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Dataset: {len(manifesto)} documents")
print(manifesto["orientation"].value_counts())
manifesto.head(3)

# Hold out 20% of the documents; stratifying keeps the left/right mix the
# same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    manifesto["text"], manifesto["orientation"],
    test_size=0.2, random_state=42, stratify=manifesto["orientation"],
)
print(f"Train: {len(X_train)}  |  Test: {len(X_test)}")

# TF-IDF features (unigrams + bigrams, English stop words dropped, terms
# seen in fewer than two documents ignored) ...
tfidf_lr = TfidfVectorizer(
    max_features=2000,
    ngram_range=(1, 2),
    min_df=2,
    stop_words="english",
    sublinear_tf=True,
)
# ... feeding a logistic regression; C is the inverse regularisation strength.
logreg = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
pipeline_lr = Pipeline(steps=[("tfidf", tfidf_lr), ("clf", logreg)])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = pipeline_lr.fit(X_train, y_train).predict(X_test)

print("Logistic Regression:")
print(classification_report(y_test, y_pred, digits=3))

# Multinomial Naive Bayes on TF-IDF unigrams: a quick baseline to compare
# against the logistic regression on the same train/test split.
tfidf_nb = TfidfVectorizer(
    max_features=2000,
    stop_words="english",
    min_df=2,
    sublinear_tf=False,
)
pipeline_nb = Pipeline(steps=[("tfidf", tfidf_nb), ("clf", MultinomialNB(alpha=1.0))])

y_pred_nb = pipeline_nb.fit(X_train, y_train).predict(X_test)

print("Naive Bayes:")
print(classification_report(y_test, y_pred_nb, digits=3))

# Side-by-side confusion matrices for the two fitted models.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

class_order = ["left", "right"]
model_predictions = [("Logistic Regression", y_pred), ("Naive Bayes", y_pred_nb)]

for ax, (model_name, y_p) in zip(axes, model_predictions):
    ConfusionMatrixDisplay(
        confusion_matrix(y_test, y_p, labels=class_order),
        display_labels=class_order,
    ).plot(ax=ax, colorbar=False, cmap="Blues")
    ax.set_title(f"Confusion matrix - {model_name}", fontsize=11)

fig.tight_layout()
plt.show()

# Inspect the fitted logistic regression: which terms carry the largest weights?
lr_vectorizer = pipeline_lr.named_steps["tfidf"]
lr_model = pipeline_lr.named_steps["clf"]

feature_names = lr_vectorizer.get_feature_names_out()
coefs = lr_model.coef_[0]   # positive pushes towards "right", negative towards "left"

# Sort ascending so the two ends of the frame are the two extremes.
coef_df = pd.DataFrame({"feature": feature_names, "coef": coefs}).sort_values("coef")
top_left = coef_df.head(15)    # 15 most negative coefficients
top_right = coef_df.tail(15)   # 15 most positive coefficients

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

panels = [
    (ax1, top_left, "steelblue", "Top 'left' features"),
    (ax2, top_right, "tomato", "Top 'right' features"),
]
for panel_ax, panel_df, bar_color, panel_title in panels:
    panel_ax.barh(panel_df["feature"], panel_df["coef"], color=bar_color, edgecolor="white")
    panel_ax.set_title(panel_title, fontsize=11)
    panel_ax.set_xlabel("Coefficient")

fig.suptitle("Most predictive terms - Logistic Regression", fontsize=12, y=1.02)
fig.tight_layout()
plt.show()

# Score both pipelines on the full dataset with the same five shuffled,
# stratified folds, using macro-averaged F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_texts, cv_targets = manifesto["text"], manifesto["orientation"]

scores_lr = cross_val_score(pipeline_lr, cv_texts, cv_targets, cv=cv, scoring="f1_macro")
scores_nb = cross_val_score(pipeline_nb, cv_texts, cv_targets, cv=cv, scoring="f1_macro")

print(f"Logistic Regression - CV F1: {scores_lr.mean():.3f} +/- {scores_lr.std():.3f}")
print(f"Naive Bayes          - CV F1: {scores_nb.mean():.3f} +/- {scores_nb.std():.3f}")

# Sweep the inverse regularisation strength C and record the cross-validated
# macro-F1 of a fresh TF-IDF + logistic-regression pipeline at each value.
C_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

fold_scores_per_C = []
for C in C_values:
    sweep_pipeline = Pipeline(steps=[
        ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english", sublinear_tf=True)),
        ("clf", LogisticRegression(C=C, max_iter=1000, random_state=42)),
    ])
    fold_scores_per_C.append(
        cross_val_score(sweep_pipeline, manifesto["text"], manifesto["orientation"],
                        cv=cv, scoring="f1_macro")
    )

cv_means = [fold_scores.mean() for fold_scores in fold_scores_per_C]
cv_stds = [fold_scores.std() for fold_scores in fold_scores_per_C]

# The C values are plotted at evenly spaced positions and labelled with
# their value, rather than on a numeric axis.
tick_positions = range(len(C_values))

fig, ax = plt.subplots(figsize=(7, 4))
ax.errorbar(tick_positions, cv_means, yerr=cv_stds,
            fmt="o-", color="steelblue", capsize=4, linewidth=2)
ax.set_xticks(tick_positions)
ax.set_xticklabels([str(c) for c in C_values])
ax.set_title("CV F1 vs regularisation strength C", fontsize=12)
ax.set_xlabel("C (inverse regularisation)")
ax.set_ylabel("F1 macro (5-fold CV)")
fig.tight_layout()
plt.show()

# Using pre-trained embeddings with spaCy (medium or large model needed for vectors)
# python -m spacy download en_core_web_md

# Try to load spaCy's medium English model and record the outcome in
# `has_vectors`, which the following cells check before touching `nlp_md`.
# A missing spaCy install (ImportError) and a missing model (OSError) both
# fall back to the "concept only" path instead of stopping the notebook.
try:
    import spacy
    nlp_md = spacy.load("en_core_web_md")
except ImportError:
    has_vectors = False
    print("spaCy is not installed. Run: conda install -c conda-forge spacy")
    print("Demonstrating the concept only.")
except OSError:
    has_vectors = False
    print("en_core_web_md not available. Run: python -m spacy download en_core_web_md")
    print("Demonstrating the concept only.")
else:
    has_vectors = True
    print("spaCy medium model loaded - word vectors available.")

if has_vectors:
    # Word similarity in embedding space (cosine similarity: higher = closer)
    pairs = [("inflation", "price"), ("inflation", "unemployment"),
             ("rate", "interest"), ("rate", "percentage"), ("bank", "money")]
    for word_a, word_b in pairs:
        sim = nlp_md(word_a).similarity(nlp_md(word_b))
        print(f"  {word_a!r} - {word_b!r}: {sim:.3f}")

# Document embeddings: average of word vectors
if has_vectors:
    def doc_embedding(text):
        """Return the mean word vector of the non-stop-word tokens in ``text``.

        ``text`` is coerced to ``str`` and run through ``nlp_md``. Tokens that
        are stop words or have no vector are skipped. If nothing is left, a
        zero vector of the model's vector length is returned so every
        document still maps to a row of the same width.
        """
        doc = nlp_md(str(text))
        vectors = [token.vector for token in doc
                   if not token.is_stop and token.has_vector]
        if not vectors:
            return np.zeros(nlp_md.vocab.vectors_length)
        return np.mean(vectors, axis=0)

    # Apply to manifesto sample (first 50 docs for speed)
    sample = manifesto.head(50).copy()
    embeddings = np.array([doc_embedding(t) for t in sample["text"]])
    print(f"Embedding matrix: {embeddings.shape}  (documents x dimensions)")

    # Quick classifier on top of embeddings. The split is stratified and the
    # SVM is seeded, matching the other splits and models in this notebook:
    # with only 10 test documents an unstratified split can be badly lopsided.
    from sklearn.svm import LinearSVC
    X_e_tr, X_e_te, y_e_tr, y_e_te = train_test_split(
        embeddings, sample["orientation"], test_size=0.2, random_state=42,
        stratify=sample["orientation"])
    svm = LinearSVC(max_iter=5000, random_state=42).fit(X_e_tr, y_e_tr)
    print(f"SVM on embeddings - accuracy: {svm.score(X_e_te, y_e_te):.3f}")
else:
    print("Skipping embedding classifier - spaCy medium model not available.")

# We already loaded zs_classifier in the warm-up; reuse it here.
# This is the same call we used on headlines, now applied to a more nuanced task:
# distinguishing hawkish vs dovish vs neutral central bank statements.

# Three stance labels for central-bank statements.
candidate_labels = ["hawkish monetary policy", "dovish monetary policy", "neutral statement"]

# One statement each for a rate rise, a hold, and a rate cut.
texts_to_classify = [
    "The Governing Council decided to raise rates by 75 basis points. Inflation is far too high.",
    "The Governing Council decided to keep rates unchanged. The economy is recovering gradually.",
    "The Governing Council decided to cut rates. The inflation outlook has improved significantly.",
]

print("Zero-shot classification (BART-MNLI):")
for statement in texts_to_classify:
    ranked = zs_classifier(statement, candidate_labels=candidate_labels)
    top_label, top_score = ranked["labels"][0], ranked["scores"][0]
    # Show the first label/score pair, then the first 90 characters of the text.
    print(f"\n  [{top_label}  {top_score:.2f}]")
    print(f"  {statement[:90]}...")

# FinBERT - domain-specific sentiment for financial text
# Downloads ~440 MB on first run; cached afterwards in ~/.cache/huggingface/

# Text-classification pipeline backed by the ProsusAI/finbert checkpoint, on CPU.
finbert = hf_pipeline(task="text-classification", model="ProsusAI/finbert", device=-1)

financial_texts = [
    "The ECB raised rates by 75bp to combat persistent inflation.",
    "Eurozone unemployment fell to a record low, supporting consumer spending.",
    "The economy contracted by 0.4% in Q3 amid the ongoing energy shock.",
    "Quarterly earnings beat analyst expectations by a wide margin.",
    "The central bank signalled prolonged restrictive policy.",
]

print("FinBERT sentiment classification:")
print("-" * 80)
for sentence in financial_texts:
    # The pipeline returns a list; its first element holds the label and score.
    top = finbert(sentence)[0]
    sentiment, confidence = top["label"], top["score"]
    print(f"  [{sentiment:<8s}  {confidence:.2f}]  {sentence}")

# Compare FinBERT vs BART-MNLI on the same text

# One deliberately balanced statement, scored by both models.
test_text = "The Governing Council maintains a data-dependent approach. Risks to the inflation outlook are broadly balanced."

# FinBERT: report the single label/score it returns for the text.
fb_result = finbert(test_text)[0]
print("FinBERT (fine-tuned on financial news):")
print(f"  {fb_result['label']:<10s} {fb_result['score']:.3f}")

# BART-MNLI: report a score for every label we supply.
stance_labels = ["hawkish", "dovish", "neutral"]
bart_result = zs_classifier(test_text, candidate_labels=stance_labels)
print("\nBART-MNLI (zero-shot, your own labels):")
for stance, stance_score in zip(bart_result["labels"], bart_result["scores"]):
    print(f"  {stance:<10s} {stance_score:.3f}")

# Your turn: paste your article, define three label sets, run.
#
# 1. Replace the article placeholder with the text of a real news article.
# 2. Replace each label set with three labels relevant to YOUR research question.
# 3. Run the cell.

# Placeholder article text: replace the line between the triple quotes.
article = """
PASTE YOUR ARTICLE HERE
"""

# Placeholder label sets: each key is a human-readable name for the set,
# each value the list of candidate labels to pass to the classifier.
label_sets = {
    "Set A — describe it":  ["label_1", "label_2", "label_3"],
    "Set B — describe it":  ["label_1", "label_2", "label_3"],
    "Set C — describe it":  ["label_1", "label_2", "label_3"],
}

# YOUR CODE BELOW — call zs_classifier on `article` for each label set,
# then print the top label and confidence for each.

# -- SOLUTION ------------------------------------------------------------------
# A worked example using a real ECB rate-hike article and three label sets
# that operationalise three different research questions.

# Article text used for the worked example (the backslash after the opening
# quotes suppresses the leading newline).
article = """\
FRANKFURT - The European Central Bank raised its three key interest rates by 25 basis points
on Thursday, taking the deposit facility rate to 4.00%. President Lagarde said inflation is
expected to remain too high for too long, citing core inflation of 5.3% in August. The
Governing Council remains committed to ensuring inflation returns to its 2% medium-term target
in a timely manner. Markets had priced in a 65% probability of the hike.
"""

# Each entry maps a display name to the candidate labels for one question.
label_sets = {
    "Set A — Broad topic":    ["economy", "politics", "society", "technology"],
    "Set B — Policy stance":  ["hawkish", "dovish", "neutral"],
    "Set C — Article tone":   ["positive outlook", "negative outlook", "mixed signals"],
}

# The loop variable is named `set_labels`, not `labels`, so that running this
# cell does not overwrite the module-level `labels` list created together
# with the synthetic dataset.
for name, set_labels in label_sets.items():
    result = zs_classifier(article, candidate_labels=set_labels)
    print(f"{name}:")
    for lab, sc in zip(result['labels'], result['scores']):
        print(f"  {lab:<22s} {sc:.3f}")
    print()

pip install anthropic openai

from anthropic import Anthropic
from openai import OpenAI

# One client object per provider, constructed with no explicit arguments.
claude        = Anthropic()    # reads ANTHROPIC_API_KEY from env
openai_client = OpenAI()       # reads OPENAI_API_KEY     from env

import json

def _parse_json_object(raw_text):
    """Parse the JSON object embedded in a model reply.

    Models sometimes wrap the object in markdown fences or add a sentence
    around it despite being told not to, so only the span from the first
    "{" to the last "}" is handed to the parser. When no such span exists
    the raw text is parsed unchanged, so a malformed reply still raises
    ``json.JSONDecodeError`` exactly as a plain ``json.loads`` would.
    """
    start, end = raw_text.find("{"), raw_text.rfind("}")
    candidate = raw_text[start:end + 1] if 0 <= start < end else raw_text
    return json.loads(candidate)


def extract_structured(article_text, provider="claude"):
    """Extract structured fields from a news article as JSON.

    Parameters
    ----------
    article_text : str
        The article to analyse; it is appended verbatim to the prompt.
    provider : str
        ``"claude"`` sends the prompt through the module-level ``claude``
        client; any other value sends it through ``openai_client``.

    Returns
    -------
    The parsed JSON reply. The prompt asks for the keys ``main_actor``,
    ``action_taken``, ``policy_area``, ``numbers_mentioned`` and
    ``sentiment``; the reply is not validated against that list.

    Raises
    ------
    json.JSONDecodeError
        If the reply contains no parseable JSON.
    """
    prompt = (
        "Extract the following structured information from this news article. "
        "Respond with ONLY valid JSON, no markdown fences, no commentary.\n\n"
        "Required fields:\n"
        "  main_actor (str): the primary actor in the story\n"
        "  action_taken (str): one short sentence\n"
        "  policy_area (str): one of [monetary, fiscal, labour, defence, trade, other]\n"
        "  numbers_mentioned (list of str): all numeric quantities with their units\n"
        "  sentiment (str): one of [positive, neutral, negative]\n\n"
        f"Article:\n{article_text}"
    )
    if provider == "claude":
        resp = claude.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=400,
            temperature=0,   # same setting as the OpenAI branch, for repeatable output
            messages=[{"role": "user", "content": prompt}],
        )
        return _parse_json_object(resp.content[0].text)
    else:
        resp = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            max_tokens=400,
            temperature=0,
            response_format={"type": "json_object"},
            messages=[{"role": "user", "content": prompt}],
        )
        return _parse_json_object(resp.choices[0].message.content)

# Example: run the extraction on the `article` text through the Claude
# branch and pretty-print the parsed result with two-space indentation.
result = extract_structured(article, provider="claude")
print(json.dumps(result, indent=2))

def estimate_llm_cost(n_docs,
                      avg_input_tokens=500,
                      avg_output_tokens=50,
                      input_price_per_mtok=1.00,
                      output_price_per_mtok=5.00,
                      model_name="Claude Haiku 4.5"):
    """Print a back-of-the-envelope cost report for an LLM batch job.

    Token totals are ``n_docs`` times the per-document averages; each total
    is priced at its per-million-token rate and the two amounts are summed.
    The report is written to stdout, followed by a blank line. Nothing is
    returned.
    """
    tokens_per_mtok = 1e6
    total_in = n_docs * avg_input_tokens
    total_out = n_docs * avg_output_tokens
    input_cost = (total_in / tokens_per_mtok) * input_price_per_mtok
    output_cost = (total_out / tokens_per_mtok) * output_price_per_mtok
    total_cost = input_cost + output_cost

    report = [
        f"Model           : {model_name}",
        f"  Documents     : {n_docs:,}",
        f"  Input tokens  : {total_in:>12,}  @ ${input_price_per_mtok}/MTok",
        f"  Output tokens : {total_out:>12,}  @ ${output_price_per_mtok}/MTok",
        f"  Est. cost     : ${total_cost:.4f}",
    ]
    print("\n".join(report))
    print()

# Same hypothetical corpus (5,000 documents, 500 input + 50 output tokens
# each) priced under four models: (input $/MTok, output $/MTok, name).
N_DOCS = 5_000
price_list = [
    (1.00, 5.00, "Claude Haiku 4.5"),
    (3.00, 15.00, "Claude Sonnet 4.6"),
    (0.15, 0.60, "GPT-4o-mini"),
    (0.25, 2.00, "GPT-5-mini"),
]
for price_in, price_out, priced_model in price_list:
    estimate_llm_cost(N_DOCS, 500, 50, price_in, price_out, priced_model)

# Task 1 - supervised classifier
# YOUR CODE HERE

# Task 2 - zero-shot / LLM classification
# YOUR CODE HERE

# Task 3 - most predictive features
# YOUR CODE HERE

# -- SOLUTION ------------------------------------------------------------------
# This solution uses the synthetic `manifesto` dataset built earlier in the lecture.
# For your own corpus, replace `manifesto` with your DataFrame and adapt
# the label column accordingly.

import numpy as np, pandas as pd, matplotlib.pyplot as plt, warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.pipeline import Pipeline

# ── Task 1: supervised classifier on TF-IDF ──────────────────────────────────
# Stratified 75/25 split of the manifesto documents.
sol_texts, sol_targets = manifesto["text"], manifesto["orientation"]
X_tr, X_te, y_tr, y_te = train_test_split(
    sol_texts, sol_targets,
    test_size=0.25, random_state=42, stratify=sol_targets,
)

# TF-IDF (unigrams + bigrams) into logistic regression.
sol_vectorizer = TfidfVectorizer(
    max_features=2000,
    ngram_range=(1, 2),
    min_df=2,
    stop_words="english",
    sublinear_tf=True,
)
sol_classifier = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
pipeline_sol = Pipeline(steps=[("tfidf", sol_vectorizer), ("clf", sol_classifier)])

y_pred_sol = pipeline_sol.fit(X_tr, y_tr).predict(X_te)

print("Task 1 - Logistic Regression classification report:")
print(classification_report(y_te, y_pred_sol, digits=3))

# Confusion matrix with rows/columns fixed to the order left, right.
sol_classes = ["left", "right"]
sol_cm = confusion_matrix(y_te, y_pred_sol, labels=sol_classes)

fig, ax = plt.subplots(figsize=(5, 4))
ConfusionMatrixDisplay(sol_cm, display_labels=sol_classes).plot(
    ax=ax, colorbar=False, cmap="Blues")
ax.set_title("Confusion matrix - LogReg on manifesto")
fig.tight_layout()
plt.show()

# ── Task 2: zero-shot classification on a sample ─────────────────────────────
# Draw 15 documents and ask the zero-shot model to place each on the
# left/right axis, then compare with the dataset's own label.
sample = manifesto.sample(15, random_state=42).reset_index(drop=True)
candidate_labels = ["left-wing political position", "right-wing political position"]

print("\nTask 2 - Zero-shot (BART-MNLI) on 15 sample documents:")
print("-" * 95)
n_agree = 0
for doc_text, true_label in zip(sample["text"], sample["orientation"]):
    # Only the first 300 characters of each document are classified.
    zs = zs_classifier(doc_text[:300], candidate_labels=candidate_labels)
    # Collapse the first returned label back to the dataset's short names.
    zs_label = "left" if "left" in zs["labels"][0] else "right"
    is_match = zs_label == true_label
    if is_match:
        n_agree += 1
    mark = "✓" if is_match else "✗"
    print(f"  true={true_label:<6}  zs={zs_label:<6} ({zs['scores'][0]:.2f}) {mark}  {doc_text[:55]}...")

print(f"\nAgreement with synthetic labels: {n_agree}/{len(sample)} = {n_agree/len(sample):.1%}")

# ── Task 3: most predictive features from the supervised classifier ──────────
# Pair every TF-IDF term with its logistic-regression coefficient and sort
# ascending, so the head holds the most negative weights and the tail the
# most positive ones.
fnames = pipeline_sol.named_steps["tfidf"].get_feature_names_out()
coefs  = pipeline_sol.named_steps["clf"].coef_[0]
coef_df = pd.DataFrame({"feature": fnames, "coef": coefs}).sort_values("coef")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.barh(coef_df.head(5)["feature"], coef_df.head(5)["coef"],
         color="steelblue", edgecolor="white")
ax1.set_title("Top 5 'left' features")
ax1.set_xlabel("Coefficient")
ax2.barh(coef_df.tail(5)["feature"], coef_df.tail(5)["coef"],
         color="tomato", edgecolor="white")
ax2.set_title("Top 5 'right' features")
ax2.set_xlabel("Coefficient")
# The vectorizer was built with ngram_range=(1, 2), so the bars can be
# unigrams as well as bigrams; the title says "features" accordingly.
fig.suptitle("Task 3 - Most predictive features", y=1.02)
fig.tight_layout()
plt.show()

Dimension	Manual pipeline (TF-IDF + LogReg)	Transformer locally (BART, FinBERT)	LLM via API (Claude, GPT)
Cost	Zero marginal	Compute time (free), 1-2 sec/doc on CPU	Per-call payment, scales linearly with corpus
Reproducibility	Deterministic; replicable forever	Deterministic; pinned by model hash	Non-deterministic unless temperature=0; model can be retired
Interpretability	Every word has a weight; you can audit it	Black box; some attention-based explainability	Black box; no "why"
Scale	Very fast (~10k docs/sec)	Moderate (~1 doc/sec CPU; ~50/sec GPU)	Slow and paid (~1-3 docs/sec, bottlenecked by API)
Flexibility	Fixed: one task per trained model	Zero-shot: arbitrary label sets at inference time	Maximum: arbitrary tasks, free-form output

Metric	Definition	Use when
Accuracy	% correctly classified	Classes are balanced
Precision	True positives / predicted positives	Cost of false positives is high
Recall	True positives / actual positives	Cost of false negatives is high
F1	Harmonic mean of precision and recall	Imbalanced classes
AUC-ROC	Area under ROC curve	Probabilistic ranking

Provider	Model	Input ($/MTok)	Output ($/MTok)	Notes
Anthropic	Claude Haiku 4.5	$1.00	$5.00	Default for classification / extraction
Anthropic	Claude Sonnet 4.6	$3.00	$15.00	Complex reasoning, nuanced text
Anthropic	Claude Opus 4.7	$5.00	$25.00	Hardest tasks, longest context
OpenAI	gpt-4o-mini	$0.15	$0.60	Cheap, well-documented legacy default
OpenAI	gpt-5-mini	$0.25	$2.00	Cheap with stronger reasoning
OpenAI	gpt-5	$1.25	$10.00	OpenAI flagship-lite
OpenAI	gpt-5.4	$2.50	$15.00	OpenAI current flagship

Workload	Claude Haiku	GPT-4o-mini
Quick demo (10-20 calls)	< $0.001	< $0.001
1,000 ECB press releases (~500 in + 50 out tokens each)	~$0.75	~$0.10
50,000 news headlines (~30 in + 10 out)	~$4	~$0.50
5,000 long earnings transcripts (~5,000 in + 200 out)	~$30	~$4

Method	Use case	Library	Cost
Logistic regression on TF-IDF	Binary/multi-class classification	`sklearn`	Free
Naive Bayes	Fast baseline, sparse data	`sklearn`	Free
Cross-validation	Robust performance estimation	`sklearn`	Free
Word embeddings	Semantic similarity	`spacy` (en_core_web_md)	Free
BART-MNLI zero-shot	No labelled data, custom labels	`transformers`	Free, local
FinBERT	Financial sentiment, fine-tuned	`transformers` (`ProsusAI/finbert`)	Free, local
LLM API (Claude / OpenAI)	Structured extraction, complex tasks	`anthropic` / `openai`	Paid (~$1-30 for typical corpora)

Lecture 9 — Machine Learning for Text & LLMs via API

Python for Economists · University of Bologna · 2025/2026

What we cover today

0. Setup — packages required for this lecture

1. Warm-up — the same task, then and now

The thing itself — zero-shot classification, free and local

2. When NOT to reach for these tools — a case for keeping your pipeline

3. Supervised text classification

4. Evaluation metrics

5. Cross-validation and overfitting on text data

6. Word embeddings: intuition

7. HuggingFace Transformers - zero-shot classification

8. Domain-specific BERT: FinBERT for financial sentiment

9. ⏱ Ten-minute challenge — labels matter

10. LLMs via API — Claude and OpenAI (optional, paid)

10a. Setting up API access

10b. Pricing (as of May 2026)

10c. Same task, two providers — a reusable pattern

10d. Structured extraction — where LLMs really pay off

10e. Cost estimation — run this BEFORE launching on a large corpus

11. Exercise

Wrap-up — what you now know how to do

Summary