# Imports - we use the same stack as L9
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

np.random.seed(42)
print("Ready.")

# Synthetic manifesto corpus (same construction as in Lecture 9), plus a
# deliberate flaw: 'left' documents will disproportionately mention recent
# years. This mimics labelled training data in which one class was
# over-sampled from the post-2015 period.

n = 300  # total number of documents; half per orientation
np.random.seed(42)  # re-seed so this cell is reproducible on its own

# Phrase banks are written one phrase per line; splitlines() turns each
# block into a plain list of str.

# Phrases characteristic of left-leaning documents.
left_phrases = """\
workers should have stronger protections
public investment in healthcare is needed
inequality has grown too much
social safety nets need strengthening
the wealthy should contribute more in taxes
collective bargaining helps wage growth
regulation can prevent market failures
the welfare state needs reinforcement
""".splitlines()

# Phrases characteristic of right-leaning documents.
right_phrases = """\
markets generally allocate resources well
government spending should be controlled
lower taxes can stimulate growth
the private sector drives most innovation
fiscal discipline matters for credibility
competition policy benefits consumers
property rights underpin investment
labor market flexibility supports employment
""".splitlines()

# Orientation-neutral filler mixed into every document.
shared_phrases = """\
the economy faces multiple challenges
policy tradeoffs need to be considered
data should inform decision making
long term outcomes matter
""".splitlines()

def make_doc_with_year(primary_phrases, label):
    """Build one synthetic document whose year mention leaks its label.

    A nominal length is drawn from 8..13; 55% of it (rounded down) is
    sampled with replacement from ``primary_phrases`` and 30% (rounded
    down) from the module-level ``shared_phrases``. A single "in <year>"
    fragment is appended, the fragments are shuffled, and everything is
    joined with spaces.

    Args:
        primary_phrases: Phrase bank for the document's own orientation.
        label: Orientation tag. "left" draws the year from 2022-2024;
            any other value draws it from 2010/2012/2015.

    Returns:
        The document text as a single space-separated string.
    """
    nominal_len = np.random.randint(8, 14)
    primary_count = max(1, int(nominal_len * 0.55))
    shared_count = max(1, int(nominal_len * 0.30))

    # List displays evaluate left to right, so the primary draw still
    # happens before the shared draw.
    fragments = [
        *np.random.choice(primary_phrases, primary_count, replace=True),
        *np.random.choice(shared_phrases, shared_count, replace=True),
    ]

    # LEAKAGE: the year pool depends on the label, so the year alone is
    # enough to tell the classes apart.
    if label == "left":
        year_pool, year_probs = ["2022", "2023", "2024"], [0.3, 0.4, 0.3]
    else:
        year_pool, year_probs = ["2010", "2012", "2015"], [0.4, 0.3, 0.3]
    year = np.random.choice(year_pool, p=year_probs)

    fragments.append(f"in {year}")
    np.random.shuffle(fragments)
    return " ".join(fragments)

# Generate n//2 documents per orientation. The outer loop runs over the
# orientations, so every 'left' document is generated before any 'right'
# one.
texts = [
    make_doc_with_year(bank, tag)
    for bank, tag in ((left_phrases, "left"), (right_phrases, "right"))
    for _ in range(n // 2)
]
labels = [tag for tag in ("left", "right") for _ in range(n // 2)]

# Assemble the frame and shuffle the rows with a fixed seed so the two
# classes are interleaved.
manifesto = (
    pd.DataFrame({"text": texts, "orientation": labels})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Dataset: {len(manifesto)} documents")
print(manifesto[["text"]].head(3).to_string())

# Train the L9-style classifier (TF-IDF + logistic regression) on a
# stratified 80/20 split, then inspect which features carry the most weight.
X_train, X_test, y_train, y_test = train_test_split(
    manifesto["text"], manifesto["orientation"],
    test_size=0.2, random_state=42, stratify=manifesto["orientation"])

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=2000, ngram_range=(1, 2),
                               min_df=2, stop_words="english", sublinear_tf=True)),
    ("clf",   LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
])
pipe.fit(X_train, y_train)
# Pipeline.score() delegates to the final estimator's score(), which for
# LogisticRegression is mean accuracy - so report it as accuracy, not F1.
print("Accuracy (test):", round(pipe.score(X_test, y_test), 3))

# Top features per class
clf = pipe.named_steps["clf"]
fnames = pipe.named_steps["tfidf"].get_feature_names_out()
coefs  = clf.coef_[0]
coef_df = pd.DataFrame({"feature": fnames, "coef": coefs}).sort_values("coef")

# In the binary case the single coefficient row scores classes_[1]:
# positive weights push towards classes_[1], negative towards classes_[0].
# Reading the names from the fitted model keeps the headers correct even
# if the label strings (and hence their sort order) change.
neg_class, pos_class = clf.classes_

# coef_df is sorted ascending, so the strongest feature is the LAST row of
# tail(8) and the FIRST row of head(8).
print(f"\nTop 8 '{pos_class}' features (positive coefficients):")
print(coef_df.tail(8).to_string(index=False))
print(f"\nTop 8 '{neg_class}' features (negative coefficients):")
print(coef_df.head(8).to_string(index=False))

# Calibration drift, simulated: one construct (hawkishness) observed in two
# periods whose statements phrase the same kind of message differently.

# Statements from the 2010 period.
period_2010 = [
    (
        "The Committee continues to anticipate that economic conditions "
        "are likely to warrant exceptionally low levels for the federal funds rate."
    ),
    "Inflation pressures appear contained and inflation expectations remain stable.",
    "The Committee is prepared to provide additional accommodation as needed.",
]

# Statements from the 2023 period.
period_2023 = [
    "The Committee is strongly committed to returning inflation to its 2 percent objective.",
    (
        "In determining the extent of additional policy firming, "
        "the Committee will consider the cumulative tightening of monetary policy."
    ),
    "The Committee anticipates that ongoing increases in the target range will be appropriate.",
]

# No model is downloaded or executed in this cell: the numbers printed below
# are placeholders standing in for a BART-MNLI run. Swap in the real
# `zs_classifier` from L9 to reproduce them.
#
# To run the model for real, uncomment the following (the label list is
# named `stance_labels` here so it does not overwrite the dataset's
# `labels` list defined earlier in this file):
# from transformers import pipeline as hf_pipeline
# zs = hf_pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=-1)
# stance_labels = ["hawkish monetary policy", "dovish monetary policy", "neutral statement"]
# for t in period_2010 + period_2023:
#     r = zs(t, candidate_labels=stance_labels)
#     print(f"{r['labels'][0]:30s} {r['scores'][0]:.2f}  {t[:60]}...")
#
# Typical results, based on running this in L9
# (top label, score, statement excerpt):
#
#   Period 2010 (Bernanke-era dovish language)
#     dovish monetary policy      0.65    "exceptionally low levels..."
#     neutral statement           0.71    "Inflation pressures appear..."
#     dovish monetary policy      0.58    "additional accommodation..."
#
#   Period 2023 (Powell-era hawkish language)
#     hawkish monetary policy     0.91    "strongly committed to returning..."
#     hawkish monetary policy     0.85    "additional policy firming..."
#     hawkish monetary policy     0.88    "ongoing increases..."

# Emit the summary in one call; sep="\n" puts each argument on its own line
# and the empty string produces the blank separator line.
print(
    "Period 2010 (dovish era) - average dovish score: ~0.62",
    "Period 2023 (hawkish era) - average hawkish score: ~0.88",
    "",
    "Apparent conclusion: BART measures a 26-percentage-point shift",
    "toward hawkishness from 2010 to 2023.",
    sep="\n",
)

# Schematic contrast between the workflow to avoid and the workflow to
# follow. Nothing here is executed as analysis: the whole schematic is
# emitted as text, one tuple entry per output line ("" = blank line).
print("\n".join((
    "WRONG WORKFLOW (will not survive refereeing):",
    "-" * 60,
    '  predicted_stance = bart_mnli(ecb_text)',
    '  reg(bond_yield ~ predicted_stance, data=panel)',
    '  -> claim: "hawkish ECB communication raises yields"',
    "",
    "RIGHT WORKFLOW:",
    "-" * 60,
    '  # Step 1 - Measurement',
    '  predicted_stance = bart_mnli(ecb_text)',
    "",
    '  # Step 2 - Validation',
    '  hand_coded = read_and_score(sample_of_50_docs)',
    '  kappa = cohen_kappa(predicted_stance[sample], hand_coded)',
    '  # report kappa, discuss disagreement cases',
    "",
    '  # Step 3 - Identification',
    '  # Either:',
    '  #   (a) event study around exogenous communication events',
    '  #   (b) IV: use BART score interacted with pre-period institutional features',
    '  #   (c) honest descriptive: "we document that..." not "we estimate that..."',
    "",
    '  reg(bond_yield ~ predicted_stance + controls,',
    '      data=panel,',
    '      design=event_study,        # or IV or honest descriptive',
    '      report=robust_to_measurement_error)',
)))

| Concept | Question it answers | Sufficient evidence |
|---|---|---|
| Prediction | What is the expected $y$ given $x$? | High held-out test accuracy / low MSE |
| Explanation | Which features drive a model's predictions? | Stable coefficients, partial dependence, SHAP |
| Causal inference | What would $y$ be if we intervened on $x$? | Exogenous variation, identification strategy |

| Validity type | Question | How to defend |
|---|---|---|
| Construct | Does the score measure the latent concept I claim? | Hand-coded gold standard; multiple-model triangulation |
| Internal | Does variation in the score reflect variation in the construct, not in incidental features? | Subsample by document length, source, period; check stability |
| External | Does the measure generalise beyond the validation sample? | Out-of-domain test (e.g. validate on Fed, apply to ECB) |

Extra L9 — Prediction, Explanation, Causality

Companion notebook to Lecture 9 — Machine Learning for Text & LLMs

Why this notebook

Reading

1. Three concepts, three different questions

2. Worked example 1 — Leakage in text classification

3. Worked example 2 — Calibration drift in LLM-based measures

4. Worked example 3 — LLM output as measurement, not as outcome

5. A validity framework for ML-on-text measures

6. Exercise — "What would a referee ask?"

Statement A

Statement B

Statement C

7. Optional extension

Wrap-up