Lecture 3 — Pandas, APIs & Econometrics¶

Python for Economists · University of Bologna · 2025/2026¶


What we cover today¶

  1. Warm-up: what pandas + an API can do in 30 seconds
  2. Why pandas? From dicts to DataFrames
  3. Selection, filtering, groupby, merge, reshape
  4. Fetching real data from public APIs
  5. OLS regression with statsmodels
  6. Panel data with fixed effects using linearmodels
  7. Exercise + Milestone: identify your research dataset

Installation (in your course environment):

conda activate <your_env>
conda install -c conda-forge wbdata
pip install linearmodels

requests ships with Anaconda — nothing to install for §6.1.

In [ ]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

print(f"pandas      {pd.__version__}")
print(f"statsmodels {sm.__version__}")

# pip install linearmodels
try:
    from linearmodels.panel import PanelOLS, PooledOLS, RandomEffects
    import linearmodels
    print(f"linearmodels {linearmodels.__version__}")
except ImportError:
    print("linearmodels not installed — run: pip install linearmodels")

0. Warm-up — from API to chart in 30 seconds¶

Before we look at a single DataFrame method, let's see what today's tools can do. I'll pull live inflation data for five eurozone countries, straight from the World Bank, and plot twenty years of price dynamics.

Watch closely. By the end of the lecture you will be able to write this yourself.

In [ ]:
# Warm-up demo: live inflation data from the World Bank + annotated plot
import matplotlib.pyplot as plt
import pandas as pd

# Eurozone subset — ISO2 codes; the full English names come back from wbdata
countries = {"IT": "Italy", "DE": "Germany", "FR": "France",
             "ES": "Spain", "NL": "Netherlands"}

try:
    import wbdata
    infl = wbdata.get_dataframe(
        {"FP.CPI.TOTL.ZG": "inflation"},
        country=list(countries.keys()),
        date=("2000", "2023"),
    ).reset_index()
    # wbdata returns the country's full name — map back to the ISO2 code
    name_to_code = {v: k for k, v in countries.items()}
    infl["country"] = infl["country"].map(name_to_code)
    infl["year"] = infl["date"].astype(int)
    wide = infl.pivot(index="year", columns="country", values="inflation").sort_index()
except Exception as e:
    # Fallback to cached illustrative values if the API is unreachable in class
    msg = f"{type(e).__name__}: {e}"
    if isinstance(e, ModuleNotFoundError):
        msg += f"  [missing module: {e.name}]"
    print(f"World Bank fetch failed ({msg}), using cached values.")
    years = list(range(2000, 2024))
    cached = {
        "IT": [2.6,2.3,2.6,2.8,2.3,2.2,2.2,2.0,3.5,0.8,1.5,2.7,3.0,1.2,0.2,0.0,-0.1,1.2,1.1,0.6,-0.1,1.9,8.2,5.6],
        "DE": [1.4,2.0,1.4,1.0,1.7,1.5,1.6,2.3,2.6,0.3,1.1,2.1,2.0,1.5,0.9,0.5,0.5,1.5,1.7,1.4,0.4,3.1,6.9,5.9],
        "FR": [1.7,1.6,1.9,2.1,2.1,1.7,1.7,1.5,2.8,0.1,1.5,2.1,2.0,0.9,0.5,0.0,0.2,1.0,1.9,1.1,0.5,1.6,5.2,4.9],
        "ES": [3.4,3.6,3.1,3.0,3.0,3.4,3.5,2.8,4.1,-0.3,1.8,3.2,2.4,1.4,-0.2,-0.5,-0.2,2.0,1.7,0.7,-0.3,3.1,8.4,3.5],
        "NL": [2.3,5.1,3.9,2.2,1.4,1.7,1.2,1.6,2.2,1.0,0.9,2.5,2.8,2.6,0.3,0.2,0.1,1.3,1.6,2.6,1.3,2.8,11.6,4.1],
    }
    wide = pd.DataFrame(cached, index=years)

fig, ax = plt.subplots(figsize=(10, 5))
wide.plot(ax=ax, linewidth=2)
ax.axhline(2, color="grey", linestyle=":", linewidth=0.8, label="ECB target (2%)")
ax.axvspan(2021, 2023, color="red", alpha=0.08, label="Energy shock")
ax.set_title("Eurozone inflation, 2000–2023 (% YoY CPI)", fontsize=13, loc="left")
ax.set_xlabel("")
ax.set_ylabel("Inflation, %")
ax.legend(loc="upper left", frameon=False, ncol=4)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
plt.tight_layout()
plt.show()

What just happened?

Three conceptual steps:

  1. An API call retrieved data directly from the World Bank — no manual download, no CSV to clean.
  2. pandas reshaped the data from long to wide format.
  3. matplotlib drew the figure, with a target line and a highlighted shock region.

All in under 20 lines of core code (the fallback block only guards against a flaky classroom connection).

The 2022 inflation spike is clearly visible, and so is how closely inflation tracked the ECB's 2% target for most of the earlier years. By the end of today's lecture, you'll have built every step of this pipeline yourself.
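The long-to-wide reshape in the pipeline is a single pivot call. A minimal sketch with illustrative numbers, independent of the live fetch above:

```python
import pandas as pd

# Long format: one row per (country, year) observation
long_df = pd.DataFrame({
    "country":   ["IT", "IT", "DE", "DE"],
    "year":      [2021, 2022, 2021, 2022],
    "inflation": [1.9, 8.2, 3.1, 6.9],
})

# Wide format: one row per year, one column per country,
# which is the shape matplotlib wants for one line per country
wide = long_df.pivot(index="year", columns="country", values="inflation")
print(wide)
```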

Discussion prompts:

  • Which country looks most vulnerable to the 2022 shock? Which most stable?
  • Why did Spain and Italy have negative inflation in 2014–2016?

Answer (second prompt):

  • Spain and Italy saw negative inflation in 2014–2016: the aftermath of the sovereign debt crisis depressed demand, and the ECB was actively fighting deflation risk.

Why this matters: primes the motivation for pandas + API access as research tools, not just programming toys.


1–5. Pandas essentials¶

We cover the core pandas operations quickly — building on L2 where we worked with dicts and lists manually.

In [ ]:
# From a dict of lists
df = pd.DataFrame({
    "country":   ["Italy","Germany","France","Spain","Netherlands"],
    "code":      ["ITA","DEU","FRA","ESP","NLD"],
    "gdp":       [2010.0,4082.0,2794.0,1419.0,1118.0],
    "growth":    [0.73,-0.30,0.90,2.50,0.10],
    "inflation": [5.8,6.1,5.7,3.5,3.8],
})
df = df.set_index("code")
df["real_growth"] = df["growth"] - df["inflation"]
print(df.shape)
df.describe().round(2)
In [ ]:
# Boolean filtering
print(df[(df["growth"] > 0) & (df["inflation"] < 5)])

# .query() syntax
print(df.query("growth > 0 and inflation < 5"))
In [ ]:
panel = pd.DataFrame({
    "country":    ["Italy","Italy","Italy","Germany","Germany","Germany",
                   "Spain","Spain","Spain"],
    "year":       [2021,2022,2023,2021,2022,2023,2021,2022,2023],
    "gdp_growth": [6.58,7.31,5.29,2.88,7.47,5.49,5.55,10.02,6.85],
    "inflation":  [1.9,8.7,5.8,3.2,8.7,6.1,3.1,8.4,3.5],
    "unemployment":[9.5,8.1,6.7,3.7,3.0,2.9,14.8,12.9,11.8],
})

panel.groupby("country").agg(
    avg_growth   =("gdp_growth",   "mean"),
    avg_inflation=("inflation",    "mean"),
    avg_unemp    =("unemployment", "mean"),
).round(2)
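The lecture outline also lists merge. A minimal sketch joining a growth panel to a second country-level table; the `euro_since` column is made-up metadata, purely for illustration:

```python
import pandas as pd

growth = pd.DataFrame({
    "country":    ["Italy", "Italy", "Germany", "Germany"],
    "year":       [2022, 2023, 2022, 2023],
    "gdp_growth": [7.31, 5.29, 7.47, 5.49],
})

# A second table keyed on country (euro_since is illustrative metadata)
meta = pd.DataFrame({
    "country":    ["Italy", "Germany"],
    "euro_since": [1999, 1999],
})

# Left join: every row of `growth` picks up its country's metadata
merged = growth.merge(meta, on="country", how="left")
print(merged.shape)  # (4, 4)
```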

6. Fetching real data from public APIs¶

Everything we've done so far used toy DataFrames. In research you usually start from an API — a URL you query that returns structured data.

We'll use the World Bank API (free, no key required, data for 200+ countries) and do the same fetch twice:

  • §6.1 — by hand, with requests. This shows what an API actually is: a URL, some query parameters, a JSON response you parse. Every REST API you'll meet works this way.
  • §6.2 — with wbdata, a Python wrapper. Once you've seen what happens underneath, the one-liner version is a legitimate shortcut.

The point of doing both: wrappers exist only for the most popular data sources. The next API you need (a central bank, a ministry, a private provider) probably won't have one — and then requests is all you have.

6.1 — Talking to the API by hand with requests¶

The World Bank exposes its data at URLs of the form:

https://api.worldbank.org/v2/country/{codes}/indicator/{indicator_code}?format=json&date=YYYY:YYYY

Let's construct one, send a GET request, and look at what comes back.

In [ ]:
import requests

countries = ["IT","DE","FR","ES","NL","PT","GR","BE","AT","FI"]
indicator = "NY.GDP.MKTP.KD.ZG"   # real GDP growth, annual %

url = f"https://api.worldbank.org/v2/country/{';'.join(countries)}/indicator/{indicator}"
params = {"format": "json", "date": "2000:2023", "per_page": 1000}

resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()           # raise an error if the HTTP status is not 2xx

payload = resp.json()

# The World Bank returns a two-element JSON array:
#   payload[0] = pagination metadata (page, pages, per_page, total, ...)
#   payload[1] = list of observations, one per (country, year)
print("Full request URL:", resp.url)
print("Metadata        :", payload[0])
print("\nFirst raw observation:")
print(payload[1][0])
In [ ]:
# Build a clean DataFrame from the raw JSON observations.
# Each observation is a dict with keys: country, countryiso3code, date, value, ...
g = pd.DataFrame([
    {"country":    obs["country"]["id"],   # ISO2 code
     "year":       int(obs["date"]),
     "gdp_growth": obs["value"]}
    for obs in payload[1]
    if obs["value"] is not None           # drop years with no reported value
])

print(g.shape)
g.head()

6.2 — The same thing with a wrapper: wbdata¶

Now that you've seen what the HTTP/JSON layer looks like, we can accept a library that hides it. wbdata is the actively maintained Python wrapper for the World Bank API. Bonus: it can fetch multiple indicators at once and hand back a ready-to-use DataFrame.

In [ ]:
import wbdata

countries = ["IT","DE","FR","ES","NL","PT","GR","BE","AT","FI"]

# A dict mapping World Bank indicator codes to the column names we want
indicators = {
    "NY.GDP.MKTP.KD.ZG": "gdp_growth",
    "SL.UEM.TOTL.ZS":    "unemployment",
}

df = wbdata.get_dataframe(
    indicators,
    country=countries,
    date=("2000", "2023"),
).reset_index()

# wbdata returns full country names ("Italy", "Germany", ...) — map back to ISO2
# so the country column matches what we saw in §6.1 and stays compact downstream.
name_to_code = {
    "Italy":"IT","Germany":"DE","France":"FR","Spain":"ES","Netherlands":"NL",
    "Portugal":"PT","Greece":"GR","Belgium":"BE","Austria":"AT","Finland":"FI",
}
df["country"] = df["country"].map(name_to_code)
df = df.rename(columns={"date": "year"})
df["year"] = df["year"].astype(int)
df = df.dropna().reset_index(drop=True)

df.to_csv("eurozone_panel.csv", index=False)
print(df.shape)
df.head()

World Bank indicators — mini dictionary¶

A curated set of the indicator codes most commonly used in applied macro / political-economy work. Pass any of these as the key in wbdata.get_dataframe({code: name}, ...), or substitute the indicator variable in the raw-requests version of §6.1.

Full catalogue: https://data.worldbank.org/indicator — or use wbdata.search_indicators("keyword") from the notebook.

Growth & Output

Code | Indicator | Units
NY.GDP.MKTP.KD.ZG | GDP growth, real | annual %
NY.GDP.MKTP.CD | GDP, nominal | current USD
NY.GDP.MKTP.KD | GDP, real | constant 2015 USD
NY.GDP.PCAP.KD | GDP per capita, real | constant 2015 USD
NY.GDP.PCAP.KD.ZG | GDP per capita growth | annual %
NY.GDP.PCAP.PP.KD | GDP per capita, PPP | constant 2017 international $

Labour market

Code | Indicator | Units
SL.UEM.TOTL.ZS | Unemployment rate, total | % of labour force
SL.UEM.1524.ZS | Youth unemployment (15–24) | % of youth labour force
SL.TLF.CACT.ZS | Labour force participation | % of population 15+
SL.TLF.CACT.FE.ZS | Female labour force participation | % of female pop. 15+
SL.EMP.TOTL.SP.ZS | Employment-to-population ratio | % of pop. 15+

Prices & exchange rates

Code | Indicator | Units
FP.CPI.TOTL.ZG | CPI inflation | annual %
FP.CPI.TOTL | Consumer price index | 2010 = 100
NY.GDP.DEFL.KD.ZG | GDP deflator, inflation | annual %
PA.NUS.FCRF | Official exchange rate | LCU per USD, period avg

Fiscal & public finance

Code | Indicator | Units
GC.DOD.TOTL.GD.ZS | Central government debt | % of GDP
GC.XPN.TOTL.GD.ZS | General government expenditure | % of GDP
GC.REV.XGRT.GD.ZS | General government revenue (ex. grants) | % of GDP
GC.BAL.CASH.GD.ZS | Cash surplus / deficit | % of GDP
GC.TAX.TOTL.GD.ZS | Tax revenue | % of GDP

Trade & external sector

Code | Indicator | Units
NE.EXP.GNFS.ZS | Exports of goods and services | % of GDP
NE.IMP.GNFS.ZS | Imports of goods and services | % of GDP
BN.CAB.XOKA.GD.ZS | Current account balance | % of GDP
BX.KLT.DINV.WD.GD.ZS | FDI, net inflows | % of GDP

Money & finance

Code | Indicator | Units
FM.LBL.BMNY.GD.ZS | Broad money (M2) | % of GDP
FR.INR.RINR | Real interest rate | %
FS.AST.DOMS.GD.ZS | Domestic credit provided by financial sector | % of GDP

Demography & human capital

Code | Indicator | Units
SP.POP.TOTL | Population, total | persons
SP.POP.GROW | Population growth | annual %
SP.DYN.LE00.IN | Life expectancy at birth | years
SP.URB.TOTL.IN.ZS | Urban population | % of total
SE.ADT.LITR.ZS | Adult literacy rate | % of pop. 15+
SE.XPD.TOTL.GD.ZS | Government expenditure on education | % of GDP

Inequality & poverty

Code | Indicator | Units
SI.POV.GINI | Gini index | 0–100
SI.POV.DDAY | Poverty headcount at $2.15/day (2017 PPP) | % of pop.
SI.DST.10TH.10 | Income share held by highest 10% | %

Governance & institutions (WGI indicators; coverage varies by country and year)

Code | Indicator | Units
CC.EST | Control of corruption, estimate | –2.5 to 2.5
RQ.EST | Regulatory quality, estimate | –2.5 to 2.5
RL.EST | Rule of law, estimate | –2.5 to 2.5
VA.EST | Voice and accountability, estimate | –2.5 to 2.5

Practical tips.

  • Always check coverage before using — wbdata.get_dataframe({code: name}, country=..., date=...) then .isna().sum() per column.
  • Indicator units are not always what the name suggests: GC.* series are central government in some countries and general government in others. Read the metadata at https://data.worldbank.org/indicator/{code}.
  • For EU/eurozone work, Eurostat has finer geography and higher frequency; use World Bank for cross-region comparability.
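The first tip in code: after any fetch, count missing values per column before trusting a series. A network-free sketch with a toy DataFrame standing in for the wbdata result (the values are illustrative):

```python
import numpy as np
import pandas as pd

# Stand-in for a wbdata.get_dataframe(...) result
check = pd.DataFrame({
    "country":    ["IT", "IT", "DE", "DE"],
    "year":       [2022, 2023, 2022, 2023],
    "gini":       [35.2, np.nan, 31.7, np.nan],  # sparse indicator
    "gdp_growth": [3.7, 0.9, 1.8, -0.3],         # dense indicator
})

# Missing-value count per column: sparse series stand out immediately
print(check.isna().sum())
```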

⏱ Five-minute challenge¶

Using what we've just seen (World Bank API + pandas), answer the following question in 5 minutes:

Which EU country had the lowest youth unemployment rate (ages 15–24) in the most recent available year? And which had the highest?

Rules

  • Use the tools we've seen above — either requests or wbdata is fine.
  • You may discuss with a neighbour but must type the code yourself.
  • When you have an answer, raise your hand.

Hint. The World Bank indicator code for youth unemployment (ages 15–24) is SL.UEM.1524.ZS.

In [ ]:
# Reference solution — reveal only after the timer
import wbdata

eu = ["AT","BE","BG","HR","CY","CZ","DK","EE","FI","FR","DE","GR","HU",
      "IE","IT","LV","LT","LU","MT","NL","PL","PT","RO","SK","SI","ES","SE"]

yu = wbdata.get_dataframe(
    {"SL.UEM.1524.ZS": "youth_unemp"},
    country=eu,
    date=("2020", "2024"),
).reset_index().rename(columns={"date": "year"})
yu["year"] = yu["year"].astype(int)
yu = yu.dropna()

# Most recent year available per country
latest = (yu.sort_values("year").groupby("country").tail(1)
            .sort_values("youth_unemp"))

print("LOWEST youth unemployment:")
print(latest.head(3).to_string(index=False))
print("\nHIGHEST youth unemployment:")
print(latest.tail(3).to_string(index=False))

Notes:

  • First student to find the answer earns a participation point.
  • Reveal the reference code only after the timer.
  • Typical answers: Germany/Netherlands/Czechia at the low end, Spain/Greece/Italy at the high end.
  • Useful segue into labour market heterogeneity, which sets up the Okun's law regressions coming next.

7. OLS Regression with statsmodels¶

statsmodels is the standard Python econometrics library. Its formula interface mirrors R and is close to Stata's reg.

7.1 Cross-sectional OLS: Okun's law¶

Does higher unemployment predict lower GDP growth? We start with country averages.

In [ ]:
# Cross-sectional OLS: Okun's law
avg = df.groupby("country").agg(
    gdp_growth   =("gdp_growth",   "mean"),
    unemployment =("unemployment", "mean"),
).reset_index()

model_cs = smf.ols("gdp_growth ~ unemployment", data=avg).fit()
print(model_cs.summary())
In [ ]:
# Robust standard errors (HC3)
model_robust = smf.ols("gdp_growth ~ unemployment", data=avg).fit(cov_type="HC3")

print(f"Coeff  : {model_robust.params['unemployment']:.3f}")
print(f"SE(HC3): {model_robust.bse['unemployment']:.3f}")
print(f"p-value: {model_robust.pvalues['unemployment']:.4f}")
print(f"95% CI : {model_robust.conf_int().loc['unemployment'].tolist()}")
In [ ]:
from statsmodels.iolib.summary2 import summary_col
import re

m1 = smf.ols("gdp_growth ~ unemployment", data=df).fit()
m2 = smf.ols("gdp_growth ~ unemployment", data=df).fit(cov_type="HC3")
m3 = smf.ols("gdp_growth ~ unemployment + C(year)", data=df).fit(cov_type="HC3")

table = summary_col(
    [m1, m2, m3],
    model_names=["(1) OLS","(2) HC3","(3) Year FE"],
    stars=True,
    info_dict={"N":  lambda m: f"{int(m.nobs)}",
               "R2": lambda m: f"{m.rsquared:.3f}"}
)
print(table)

8. Panel Data with Fixed Effects using linearmodels¶

Pooled OLS ignores the panel structure. Any time-invariant country characteristic (institutions, geography) that correlates with both unemployment and growth will confound our estimates.

Fixed effects (FE) absorb all time-invariant unobserved heterogeneity by demeaning within each unit — equivalent to a dummy per country.

The FE estimator identifies the coefficient purely from within-country variation over time. Cross-country level differences are absorbed.

The Bellman equation of econometrics¶

$$V_t(W_t) = \max_{c_t} \left[ u(c_t) + \beta \mathbb{E}[V_{t+1}(W_{t+1})] \right]$$

...actually that comes in L4. Here the key equation is the within estimator:

$$y_{it} - \bar{y}_i = \beta(x_{it} - \bar{x}_i) + (\varepsilon_{it} - \bar{\varepsilon}_i)$$
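The within transformation can be checked numerically: demeaning y and x inside each unit and running OLS on the demeaned data gives exactly the same slope as regressing y on x plus a full set of unit dummies (the LSDV estimator). A sketch on a small simulated panel; all numbers are synthetic:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_units, n_periods = 4, 6
unit = np.repeat(np.arange(n_units), n_periods)

alpha = np.array([1.0, -2.0, 0.5, 3.0])[unit]          # unit fixed effects
x = rng.normal(size=unit.size) + alpha                 # x correlated with the FE
y = 2.0 * x + alpha + rng.normal(scale=0.1, size=unit.size)

d = pd.DataFrame({"unit": unit, "x": x, "y": y})

# (a) Within estimator: demean inside each unit, then take the OLS slope
xd = d["x"] - d.groupby("unit")["x"].transform("mean")
yd = d["y"] - d.groupby("unit")["y"].transform("mean")
beta_within = (xd @ yd) / (xd @ xd)

# (b) LSDV: regress y on x plus one dummy per unit, no constant
X = np.column_stack([d["x"], pd.get_dummies(d["unit"], dtype=float)])
beta_lsdv = np.linalg.lstsq(X, d["y"], rcond=None)[0][0]

print(beta_within, beta_lsdv)  # the two slopes coincide, close to the true 2.0
```

Because x is built to correlate with the fixed effects, a pooled regression of y on x would be biased; both FE routes above recover the true slope.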

In [ ]:
# Panel setup for linearmodels — requires MultiIndex (entity, time)
from linearmodels.panel import PanelOLS, PooledOLS, RandomEffects

panel_df = df.set_index(["country","year"])
print(panel_df.head(8))
print(f"\nDimensions: {panel_df.index.get_level_values(0).nunique()} countries x "
      f"{panel_df.index.get_level_values(1).nunique()} years")
In [ ]:
# Pooled OLS — baseline (ignores panel structure)
res_pool = PooledOLS.from_formula(
    "gdp_growth ~ unemployment",
    data=panel_df
).fit(cov_type="clustered", cluster_entity=True)
print("Pooled OLS (clustered SE):")
print(res_pool.summary.tables[1])
In [ ]:
# Two-way FE: country FE + year FE — manual year dummies
#
# Why manual dummies? Two bugs in linearmodels 7.0 × pandas 3.0 ruled out the
# shorter options:
#   1. `EntityEffects + TimeEffects` crashes in the two-way fast path
#      (`expand_categoricals` lacks an empty-frame guard in pandas 3.0).
#   2. `C(year_cat)` via the formula interface expands to a cartesian product
#      on MultiIndex panel data (rows blow up to n_obs × n_years).
# Safe path: build the year dummies ourselves and pass them as plain numeric
# regressors. Coefficients on `unemployment` are numerically identical to TWFE;
# we just see the year dummies printed explicitly rather than absorbed silently.

import pandas as pd

year_idx = panel_df.index.get_level_values("year")
year_dummies = pd.get_dummies(
    pd.Series(year_idx, index=panel_df.index, name="year"),
    prefix="yr", drop_first=True, dtype=float,
)

exog = pd.concat([panel_df[["unemployment"]], year_dummies], axis=1)

res_fe = PanelOLS(
    dependent=panel_df["gdp_growth"],
    exog=exog,
    entity_effects=True,
).fit(cov_type="clustered", cluster_entity=True)

print("Two-way FE (country + year), clustered SE:")
print(res_fe.summary.tables[1])
In [ ]:
# Side-by-side comparison
cp = float(res_pool.params["unemployment"]); sp = float(res_pool.std_errors["unemployment"])
cf = res_fe.params["unemployment"];          sf = res_fe.std_errors["unemployment"]

print("=" * 52)
print(f"{'':28} {'Pooled':>10}  {'TWFE':>8}")
print("-" * 52)
print(f"{'unemployment':<28} {cp:>10.3f}   {cf:>8.3f}")
print(f"{'':28} ({sp:.3f})      ({sf:.3f})")
print(f"{'R2 within':<28} {'':>10}   {res_fe.rsquared_within:>8.3f}")
print(f"{'N':<28} {int(res_pool.nobs):>10}   {int(res_fe.nobs):>8}")
print("=" * 52)
print("SEs clustered at country level.")
In [ ]:
# Random Effects — compare with FE (Hausman intuition)
try:
    res_re = RandomEffects.from_formula(
        "gdp_growth ~ unemployment", data=panel_df
    ).fit(cov_type="robust")
    cf = res_fe.params["unemployment"]
    cr = res_re.params["unemployment"]
    print(f"FE coefficient : {cf:.4f}")
    print(f"RE coefficient : {cr:.4f}")
    print(f"Difference     : {abs(cf-cr):.4f}")
    print()
    print("A large difference suggests the RE orthogonality assumption is violated")
    print("=> FE is the consistent estimator.")
except Exception as e:  # small panels can trip RE estimation numerically
    print(f"RandomEffects failed ({type(e).__name__}): numerical issue with this small panel.")
    print("Concept: RE assumes the entity effect is uncorrelated with regressors.")
    print("When violated, FE is the consistent estimator — which is what we used above.")
In [ ]:
# Export to LaTeX (like esttab/outreg2 in Stata)
latex_out = res_fe.summary.as_latex()
with open("twfe_table.tex","w") as f:
    f.write(latex_out)
print("Saved: twfe_table.tex")
print()
for line in latex_out.split("\n")[:15]:
    print(line)

9. Exercise¶

Time: ~25 minutes. Work individually.

Task 1. Download inflation (FP.CPI.TOTL.ZG) for the 10 eurozone countries, 2000–2023. Merge with the df DataFrame in memory. Add a post_crisis dummy (1 if year >= 2008) and an interaction unemp_x_post = unemployment × post_crisis.

Task 2. Run and compare four specifications:

  • (1) Pooled OLS
  • (2) Pooled OLS + inflation, HC3 SE
  • (3) Country FE only, clustered SE
  • (4) Two-way FE (country + year), clustered SE

Report in a comparison table. What happens to the unemployment coefficient?

Task 3. Add the interaction unemp_x_post to the TWFE specification. Did the Okun relationship change after the GFC?

Task 4 (optional). Export your preferred specification to a LaTeX table.

In [ ]:
# Task 1 — download inflation (FP.CPI.TOTL.ZG) and merge with df
# YOUR CODE HERE

# Task 2 — four specifications: Pooled OLS, Pooled+inflation, Country FE, TWFE
# YOUR CODE HERE

# Task 3 — interaction: unemployment x post_crisis dummy
# YOUR CODE HERE

# Task 4 (optional) — LaTeX table of preferred specification
# YOUR CODE HERE

Solution¶

In [ ]:
# SOLUTION ---------------------------------------------------------------
import pandas as pd, warnings
warnings.filterwarnings("ignore")
import wbdata
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from linearmodels.panel import PanelOLS, PooledOLS

countries = ["IT","DE","FR","ES","NL","PT","GR","BE","AT","FI"]
name_to_code = {
    "Italy":"IT","Germany":"DE","France":"FR","Spain":"ES","Netherlands":"NL",
    "Portugal":"PT","Greece":"GR","Belgium":"BE","Austria":"AT","Finland":"FI",
}

# Task 1
infl = wbdata.get_dataframe(
    {"FP.CPI.TOTL.ZG": "inflation"},
    country=countries,
    date=("2000", "2023"),
).reset_index().rename(columns={"date": "year"})
infl["country"] = infl["country"].map(name_to_code)
infl["year"] = infl["year"].astype(int)

df = pd.read_csv("eurozone_panel.csv"); df["year"] = df["year"].astype(int)
df = pd.merge(df, infl, on=["country","year"], how="left").dropna()
df["post_crisis"]  = (df["year"] >= 2008).astype(int)
df["unemp_x_post"] = df["unemployment"] * df["post_crisis"]
print("Merged:", df.shape)

# Task 2
panel = df.set_index(["country","year"])

# Build year dummies once — reused for TWFE specs (see lecture note in §8)
year_idx = panel.index.get_level_values("year")
year_dummies = pd.get_dummies(
    pd.Series(year_idx, index=panel.index, name="year"),
    prefix="yr", drop_first=True, dtype=float,
)

m1 = smf.ols("gdp_growth ~ unemployment", data=df).fit()
m2 = smf.ols("gdp_growth ~ unemployment + inflation", data=df).fit(cov_type="HC3")
m3 = PanelOLS.from_formula("gdp_growth ~ unemployment + EntityEffects",
     data=panel).fit(cov_type="clustered", cluster_entity=True)
exog_twfe = pd.concat([panel[["unemployment"]], year_dummies], axis=1)
m4 = PanelOLS(
    dependent=panel["gdp_growth"], exog=exog_twfe, entity_effects=True,
).fit(cov_type="clustered", cluster_entity=True)

print("\nCoefficient comparison (unemployment):")
print(f"  (1) Pooled OLS : {m1.params['unemployment']:.3f} ({m1.bse['unemployment']:.3f})")
print(f"  (2) + inflation: {m2.params['unemployment']:.3f} ({m2.bse['unemployment']:.3f})")
print(f"  (3) Country FE : {m3.params['unemployment']:.3f} ({m3.std_errors['unemployment']:.3f})")
print(f"  (4) TWFE       : {m4.params['unemployment']:.3f} ({m4.std_errors['unemployment']:.3f})")

# Task 3: interaction with post-crisis, on top of TWFE
exog_int = pd.concat(
    [panel[["unemployment","unemp_x_post"]], year_dummies],
    axis=1,
)
m5 = PanelOLS(
    dependent=panel["gdp_growth"], exog=exog_int, entity_effects=True,
    drop_absorbed=True,
).fit(cov_type="clustered", cluster_entity=True)
print("\nTask 3 - interaction results:")
print(m5.summary.tables[1])

# Task 4
with open("twfe_results.tex","w") as f:
    f.write(m4.summary.as_latex())
print("\nSaved: twfe_results.tex")

Summary¶

Task | Tool | Stata equivalent
OLS | smf.ols().fit() | reg y x
Robust SE | .fit(cov_type="HC3") | reg y x, robust
Year dummies | C(year) in formula | xi: reg y x i.year
Pooled OLS (panel) | PooledOLS.from_formula() | reg y x
Country FE | EntityEffects | xtreg y x, fe
Two-way FE | EntityEffects + TimeEffects | reghdfe y x, absorb(c y)
Clustered SE | cov_type="clustered" | , cluster(country)
LaTeX export | .summary.as_latex() | esttab

Next lecture¶

Lecture 4 — Visualization, EDA & Dynamic Programming: publication-ready figures, systematic EDA, cake eating problem.