Extra L3 — Pandas Joins, Keys, and Data Integrity¶

This notebook complements Lecture 3: Pandas, APIs & Econometrics.

Goal¶

Understand why merges fail, how duplicate keys generate false results, and how to validate join logic.
Then connect those issues to the construction of a well-formed panel dataset and to the econometric assumptions that underlie within-estimator identification.

Core lesson¶

A merge is not merely a convenience function.
It is a formal operation on relational data, and wrong key assumptions can invalidate an empirical result.
The same logic carries over directly to panel econometrics: the unit of observation, its uniqueness, and the completeness of the panel are not implementation details—they are part of the identification argument.

In [19]:
import pandas as pd
import numpy as np

1. Construct two small datasets¶

Suppose you have:

  • one table with individual purchases,
  • one table with household characteristics.

A many-to-one merge is valid only if each household appears once in the household table.

In [20]:
purchases = pd.DataFrame({
    "person_id": [1, 2, 3, 4, 5, 6],
    "household_id": [10, 10, 11, 12, 12, 13],
    "spending": [120, 80, 200, 150, 170, 90]
})

households = pd.DataFrame({
    "household_id": [10, 11, 12, 12, 13],
    "income_bracket": ["mid", "high", "low", "low", "mid"],
    "region": ["North", "North", "South", "South", "Center"]
})

purchases, households
Out[20]:
(   person_id  household_id  spending
 0          1            10       120
 1          2            10        80
 2          3            11       200
 3          4            12       150
 4          5            12       170
 5          6            13        90,
    household_id income_bracket  region
 0            10            mid   North
 1            11           high   North
 2            12            low   South
 3            12            low   South
 4            13            mid  Center)

2. The bad merge¶

At first glance, this code looks innocent.

In [21]:
merged_bad = purchases.merge(households, on="household_id", how="left")
merged_bad
Out[21]:
   person_id  household_id  spending income_bracket  region
0          1            10       120            mid   North
1          2            10        80            mid   North
2          3            11       200           high   North
3          4            12       150            low   South
4          4            12       150            low   South
5          5            12       170            low   South
6          5            12       170            low   South
7          6            13        90            mid  Center
In [22]:
print("Rows in purchases:", len(purchases))
print("Rows after merge:", len(merged_bad))
print("Total spending before merge:", purchases["spending"].sum())
print("Total spending after merge:", merged_bad["spending"].sum())
Rows in purchases: 6
Rows after merge: 8
Total spending before merge: 810
Total spending after merge: 1130

Interpretation¶

If the row count and totals change after a many-to-one merge, something is wrong.
The issue is usually duplicate keys on the right-hand dataset.

In [23]:
household_key_counts = households["household_id"].value_counts().rename_axis("household_id").reset_index(name="count")
household_key_counts
Out[23]:
   household_id  count
0            12      2
1            10      1
2            11      1
3            13      1

3. Diagnose the source of duplication¶

In [24]:
problem_keys = household_key_counts.loc[household_key_counts["count"] > 1, "household_id"].tolist()
print("Duplicate keys on RHS:", problem_keys)

households[households["household_id"].isin(problem_keys)]
Duplicate keys on RHS: [12]
Out[24]:
   household_id income_bracket region
2            12            low  South
3            12            low  South

4. Fix 1: enforce uniqueness before merging¶

A principled fix depends on the data-generating process.
Here we know the duplicated rows are accidental, so we remove them.

In [25]:
households_unique = households.drop_duplicates(subset=["household_id"])
merged_good = purchases.merge(households_unique, on="household_id", how="left")

print("Rows after corrected merge:", len(merged_good))
print("Total spending after corrected merge:", merged_good["spending"].sum())
merged_good
Rows after corrected merge: 6
Total spending after corrected merge: 810
Out[25]:
   person_id  household_id  spending income_bracket  region
0          1            10       120            mid   North
1          2            10        80            mid   North
2          3            11       200           high   North
3          4            12       150            low   South
4          5            12       170            low   South
5          6            13        90            mid  Center

5. Validation checks you should always write¶

In [26]:
def assert_unique(df, keys, name):
    dup = df.duplicated(subset=keys).sum()
    assert dup == 0, f"{name}: found {dup} duplicated key rows for {keys}"

def assert_preserves_rows(left, merged, left_name="left", merged_name="merged"):
    assert len(left) == len(merged), f"{left_name} has {len(left)} rows but {merged_name} has {len(merged)}"

def assert_preserves_sum(left, merged, col):
    # np.isclose guards against spurious failures from floating-point rounding
    assert np.isclose(left[col].sum(), merged[col].sum()), f"Column {col} sum changed after merge"

assert_unique(households_unique, ["household_id"], "households_unique")
assert_preserves_rows(purchases, merged_good)
assert_preserves_sum(purchases, merged_good, "spending")
print("All checks passed.")
All checks passed.
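
A related check, not used above, is the indicator argument of merge(). A minimal sketch, assuming you also want to detect purchases whose household is missing from the right-hand table (the variable names merged_flagged and unmatched are illustrative):

# indicator=True adds a "_merge" column telling you, for each row, whether the key
# was found in "both" tables, "left_only", or "right_only"
merged_flagged = purchases.merge(households_unique, on="household_id",
                                 how="left", indicator=True)

# Any "left_only" rows are purchases whose household is absent from the household table
unmatched = merged_flagged[merged_flagged["_merge"] == "left_only"]
print("Unmatched purchase rows:", len(unmatched))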

6. Fix 2: aggregate before merge¶

Sometimes duplicates are real rather than accidental.
Then you must aggregate to the desired key level.

In [27]:
households_multi = pd.DataFrame({
    "household_id": [10, 10, 11, 12, 12, 13],
    "asset_value": [100, 150, 300, 80, 50, 120]
})

assets_agg = households_multi.groupby("household_id", as_index=False)["asset_value"].sum()
assets_agg
Out[27]:
   household_id  asset_value
0            10          250
1            11          300
2            12          130
3            13          120
In [28]:
purchases.merge(assets_agg, on="household_id", how="left")
Out[28]:
   person_id  household_id  spending  asset_value
0          1            10       120          250
1          2            10        80          250
2          3            11       200          300
3          4            12       150          130
4          5            12       170          130
5          6            13        90          120
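
When duplicates are real, aggregation discards information, so it can help to carry along how many rows were collapsed into each key. A minimal sketch using pandas named aggregation (the column names asset_value_total and n_assets are illustrative, not part of the original example):

# Aggregate to household level, keeping both the total and the number of underlying rows
assets_agg2 = households_multi.groupby("household_id").agg(
    asset_value_total=("asset_value", "sum"),
    n_assets=("asset_value", "count"),
).reset_index()

purchases.merge(assets_agg2, on="household_id", how="left")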

Short write-up prompt¶

Explain in 5–8 lines:

  1. Why duplicate keys on the right-hand side can create false precision.
  2. Why validating row counts and column totals should be standard practice.
  3. When drop_duplicates() is acceptable and when it is dangerous.

Optional extension¶

  • Reproduce the same issue with a many-to-many merge.
  • Add validate="many_to_one" to merge() and see how pandas helps.
  • Create a custom function safe_merge() that runs all checks automatically.

Advanced section — Panel construction and the within estimator¶

The issues above—duplicate keys, unit of observation, completeness—are not just data-cleaning problems.
They map directly onto the assumptions that make panel econometrics work.

This section connects the two.

A1. What does a well-formed panel look like?¶

A panel dataset has a two-level key: (entity, time).
Before any estimation, this key should be:

  1. unique—each (entity, time) pair appears at most once,
  2. declared to pandas via set_index (see the sketch after the simulation below),
  3. inspected for balance and missing cells.

We simulate a small macro panel—countries observed across years.

In [29]:
np.random.seed(42)

countries = ["ITA", "DEU", "FRA", "ESP"]
years = list(range(2018, 2024))

rows = []
for c in countries:
    for y in years:
        rows.append({
            "country": c,
            "year": y,
            "gdp_growth": np.random.normal(1.5, 1.2),
            "unemployment": np.random.uniform(5, 15)
        })

panel = pd.DataFrame(rows)
print("Shape:", panel.shape)
panel.head(8)
Shape: (24, 4)
Out[29]:
  country  year  gdp_growth  unemployment
0     ITA  2018    2.096057     12.319939
1     ITA  2019    1.334083     10.986585
2     ITA  2020    1.219016      5.580836
3     ITA  2021    1.219036     13.661761
4     ITA  2022    3.395055      5.205845
5     ITA  2023    2.420922     14.699099
6     DEU  2018    0.936631      6.818250
7     DEU  2019    2.151072      6.834045
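
Point 2 of the checklist above can be made concrete here. A minimal sketch: set_index with verify_integrity=True declares the (entity, time) key and raises an error if any pair is duplicated (panel_indexed is an illustrative name):

# Declare the (country, year) key and let pandas enforce its uniqueness in one step:
# verify_integrity=True raises a ValueError if any (country, year) pair appears twice
panel_indexed = panel.set_index(["country", "year"], verify_integrity=True)
panel_indexed.head()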

A2. Check balance¶

An unbalanced panel—where not every entity is observed in every period—is common in practice
and has implications for which estimators are valid.
Always document it explicitly.

In [30]:
def panel_balance_report(df, entity_col, time_col):
    obs_per_entity = df.groupby(entity_col)[time_col].count()
    n_entities = df[entity_col].nunique()
    n_periods  = df[time_col].nunique()
    n_cells    = n_entities * n_periods
    n_obs      = len(df)

    print(f"Entities : {n_entities}")
    print(f"Periods  : {n_periods}")
    print(f"Expected cells (balanced): {n_cells}")
    print(f"Actual observations      : {n_obs}")
    print(f"Missing cells            : {n_cells - n_obs}")
    print(f"Balanced                 : {n_obs == n_cells}")
    print()
    print("Observations per entity:")
    print(obs_per_entity.to_string())

panel_balance_report(panel, "country", "year")
Entities : 4
Periods  : 6
Expected cells (balanced): 24
Actual observations      : 24
Missing cells            : 0
Balanced                 : True

Observations per entity:
country
DEU    6
ESP    6
FRA    6
ITA    6
In [31]:
# Introduce some missingness to illustrate the unbalanced case
panel_unbal = panel.drop(index=[2, 7, 15]).reset_index(drop=True)
print("--- Unbalanced panel ---")
panel_balance_report(panel_unbal, "country", "year")
--- Unbalanced panel ---
Entities : 4
Periods  : 6
Expected cells (balanced): 24
Actual observations      : 21
Missing cells            : 3
Balanced                 : False

Observations per entity:
country
DEU    5
ESP    6
FRA    5
ITA    5
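
Beyond counting missing cells, it is often useful to list exactly which (entity, time) pairs are absent. A minimal sketch, assuming the target grid is the full Cartesian product of the observed countries and years:

# Build the full (country, year) grid and keep the pairs not present in the unbalanced panel
full_grid = pd.MultiIndex.from_product([countries, years], names=["country", "year"])
observed  = pd.MultiIndex.from_frame(panel_unbal[["country", "year"]])
missing_cells = full_grid.difference(observed)
print("Missing (country, year) cells:")
print(missing_cells.to_frame(index=False))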

A3. The within transformation — manually¶

The fixed effects (FE) estimator identifies coefficients from within-entity variation over time,
by demeaning each variable at the entity level:

$$\tilde{y}_{it} = y_{it} - \bar{y}_i \qquad \tilde{x}_{it} = x_{it} - \bar{x}_i$$

What this transformation absorbs and what it discards matter for interpretation.

We implement it manually to make the logic concrete.

In [32]:
# Compute within-entity means
means = panel.groupby("country")[["gdp_growth", "unemployment"]].transform("mean")

panel_within = panel.copy()
panel_within["gdp_growth_dm"]    = panel["gdp_growth"]    - means["gdp_growth"]
panel_within["unemployment_dm"]  = panel["unemployment"]  - means["unemployment"]

# Sanity check: within-demeaned means should be zero for each country
check = panel_within.groupby("country")[["gdp_growth_dm", "unemployment_dm"]].mean().round(10)
print("Within-entity means after demeaning (should be ~0):")
print(check)
Within-entity means after demeaning (should be ~0):
         gdp_growth_dm  unemployment_dm
country                                
DEU                0.0              0.0
ESP               -0.0              0.0
FRA               -0.0              0.0
ITA               -0.0              0.0

A4. What the within transformation absorbs¶

Any variable that does not vary within an entity over time is perfectly collinear with the entity fixed effects
and cannot be identified under FE.

Classic examples: country area, founding year of a firm, individual sex in a short panel.

In [33]:
# Add a time-invariant variable
area = {"ITA": 301, "DEU": 357, "FRA": 551, "ESP": 506}  # thousands of km²
panel_within["area"] = panel_within["country"].map(area)

# After within-demeaning, a time-invariant variable becomes exactly zero
panel_within["area_dm"] = panel_within["area"] - panel_within.groupby("country")["area"].transform("mean")

print("Variance of area after demeaning:", panel_within["area_dm"].var().round(12))
print("→ FE cannot estimate the effect of time-invariant regressors.")
Variance of area after demeaning: 0.0
→ FE cannot estimate the effect of time-invariant regressors.

A5. OLS on demeaned data reproduces the FE coefficient¶

This is the algebraic equivalence between the within estimator and a dummy-variable regression.
Understanding it makes the FE assumptions transparent.

In [34]:
import statsmodels.formula.api as smf

# OLS on demeaned variables (no intercept: absorbed by demeaning)
ols_within = smf.ols("gdp_growth_dm ~ unemployment_dm - 1", data=panel_within).fit()

# OLS with explicit country dummies — numerically equivalent
ols_dummy = smf.ols("gdp_growth ~ unemployment + C(country)", data=panel).fit()

print("Within estimator (manual demeaning):")
print(f"  coeff unemployment = {ols_within.params['unemployment_dm']:.4f}")
print()
print("OLS with country dummies (equivalent):")
print(f"  coeff unemployment = {ols_dummy.params['unemployment']:.4f}")
print()
print("Difference:", round(abs(ols_within.params['unemployment_dm'] - ols_dummy.params['unemployment']), 10))
Within estimator (manual demeaning):
  coeff unemployment = -0.1287

OLS with country dummies (equivalent):
  coeff unemployment = -0.1287

Difference: 0.0
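
One caveat worth making explicit: the equivalence is exact for the coefficient but not for the reported standard error, because the demeaned regression does not account for the four entity means estimated in the demeaning step. A minimal sketch of the degrees-of-freedom correction, using the df_resid attributes of the two fitted models:

# The residuals of the two regressions are identical, but their degrees of freedom differ:
# the demeaned regression "forgets" the entity means used in the demeaning step, so its
# SE is slightly too small. Rescaling by sqrt(df_within / df_dummy) should recover the
# SE of the dummy-variable regression.
corrected_se = (ols_within.bse["unemployment_dm"]
                * np.sqrt(ols_within.df_resid / ols_dummy.df_resid))

print("Naive SE (demeaned OLS)  :", round(ols_within.bse["unemployment_dm"], 4))
print("Corrected SE             :", round(corrected_se, 4))
print("SE from dummy regression :", round(ols_dummy.bse["unemployment"], 4))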

A6. Between variation vs within variation¶

Cross-sectional OLS exploits between-entity variation (differences in country averages).
The FE estimator exploits only within-entity variation (how a country changes over time).

When the two give different answers, it usually signals that omitted time-invariant confounders are biasing the between estimator—the intuition behind the Hausman test.

In [35]:
# Between estimator: OLS on entity means
between = panel.groupby("country")[["gdp_growth", "unemployment"]].mean().reset_index()
ols_between = smf.ols("gdp_growth ~ unemployment", data=between).fit()

print("Between estimator (cross-sectional, country averages):")
print(f"  coeff unemployment = {ols_between.params['unemployment']:.4f}  "
      f"(SE = {ols_between.bse['unemployment']:.4f})")
print()
print("Within estimator (FE, within-country variation):")
print(f"  coeff unemployment = {ols_within.params['unemployment_dm']:.4f}  "
      f"(SE = {ols_within.bse['unemployment_dm']:.4f})")
print()
print("If these differ substantially, time-invariant confounders are likely at work.")
print("The FE (within) estimator is consistent under weaker assumptions.")
Between estimator (cross-sectional, country averages):
  coeff unemployment = 0.0974  (SE = 0.2324)

Within estimator (FE, within-country variation):
  coeff unemployment = -0.1287  (SE = 0.0649)

If these differ substantially, time-invariant confounders are likely at work.
The FE (within) estimator is consistent under weaker assumptions.

A7. Panel construction checklist¶

Before running any panel regression, verify each of the following.

In [36]:
def panel_pre_flight(df, entity_col, time_col, y_col, x_cols):
    print("=== Panel pre-flight checklist ===")

    # 1. Unique (entity, time) key
    dup = df.duplicated(subset=[entity_col, time_col]).sum()
    print(f"[{'OK' if dup == 0 else 'FAIL'}] Unique (entity, time) key — duplicates: {dup}")

    # 2. Balance
    n_e = df[entity_col].nunique()
    n_t = df[time_col].nunique()
    balanced = len(df) == n_e * n_t
    print(f"[{'OK' if balanced else 'WARN'}] Balanced panel — {len(df)} obs, expected {n_e * n_t}")

    # 3. Missing values in key columns
    cols = [y_col] + x_cols
    missing = df[cols].isna().sum()
    any_missing = missing.any()
    print(f"[{'WARN' if any_missing else 'OK'}] Missing values in analysis columns:")
    for col, n in missing.items():
        if n > 0:
            print(f"      {col}: {n} missing")
    if not any_missing:
        print("      none")

    # 4. Within variation — FE requires x to vary within entities
    print("[INFO] Within-entity standard deviation of regressors:")
    within_sd = df.groupby(entity_col)[x_cols].std().mean()
    for col, sd in within_sd.items():
        flag = "OK" if sd > 0.01 else "WARN — near zero within variation"
        print(f"      {col}: {sd:.4f}  [{flag}]")

panel_pre_flight(panel, "country", "year", "gdp_growth", ["unemployment"])
=== Panel pre-flight checklist ===
[OK] Unique (entity, time) key — duplicates: 0
[OK] Balanced panel — 24 obs, expected 24
[OK] Missing values in analysis columns:
      none
[INFO] Within-entity standard deviation of regressors:
      unemployment: 2.7130  [OK]

Advanced write-up prompt¶

Answer in 8–10 lines:

  1. What is the unit of observation in your panel, and how would you verify its uniqueness?
  2. Why does a time-invariant regressor drop out under FE, and what are the implications for your research design?
  3. If the between and within estimates differ substantially, what does that tell you about potential confounders?
  4. When would you prefer RE over FE despite failing the Hausman test?

Advanced optional extension¶

  • Add a time trend to the simulation and verify that year fixed effects absorb it.
  • Introduce a time-varying confounder correlated with both unemployment and gdp_growth and show how the within estimator still performs if the confounder is time-invariant, but fails if it varies over time.
  • Extend panel_pre_flight to flag singletons (entities observed only once) and discuss why they contribute no within variation.