import pandas as pd

# Replication checklist for dataset projects.
# Items are grouped under their section, so the category label of each row is
# derived from the group it belongs to instead of being typed once per row.
_checklist_sections = {
    # Index quality (Priority 1)
    "Index": [
        "Construct definition stated in one sentence (what the index measures, and what it does NOT)",
        "Index methodology described step-by-step (tokenisation, dictionary or model, aggregation rule)",
        "Validation sample: at least 20-50 documents hand-coded and compared to the index",
        "Sample anchor texts included in the appendix (3-5 documents with high/low/mid index values)",
        "Sensitivity check: index recomputed with one alternative choice (different dictionary, different model)",
    ],
    # Scraping reproducibility (Priority 2)
    "Scraping": [
        "Scraping script terminates cleanly on its own (no manual restarts hidden in comments)",
        "Checkpoint logic implemented: re-running picks up where the previous run stopped",
        "URL regex matches structural patterns, not CSS classes or fragile DOM positions",
        "Date parsing handles edge cases (`dropna` after `to_datetime`, document rejection rule explicit)",
        "Final corpus stats reported: N documents, date range, source, missing-value rate",
    ],
    # Honest interpretation (Priority 3)
    "Interpretation": [
        "Claim type explicit ('we document', 'we estimate', 'we suggest') and matches the design",
        "If causal language used: identification strategy stated and threats discussed",
        "If descriptive: stated as such, no implicit causal verbs (`leads to`, `causes`, `drives`)",
        "Time-series plots accompanied by event annotations (so the reader can audit visually)",
        "Limitations section names at least 2 specific weaknesses (not generic 'more data needed')",
    ],
}

dataset_checklist = pd.DataFrame({
    # One category label per item, in section order.
    "category": [
        section
        for section, entries in _checklist_sections.items()
        for _ in entries
    ],
    # All items flattened into a single column, section by section.
    "item": [
        entry
        for entries in _checklist_sections.values()
        for entry in entries
    ],
    # Every item starts out unchecked.
    "status": ["todo"] * sum(len(entries) for entries in _checklist_sections.values()),
})

print(dataset_checklist.to_string(index=False))

# Empty template - fill in for YOUR project
import pandas as pd

# Blank project map: the seven pipeline stages are pre-filled, while the
# remaining columns hold empty strings for the reader to complete.
project_map_template = pd.DataFrame({
    "stage": [
        "raw data acquisition",
        "cleaning & deduplication",
        "feature / index construction",
        "validation",
        "analysis",
        "figures & tables",
        "write-up",
    ],
    # One empty cell per stage in each to-be-filled column.
    **{
        blank_column: [""] * 7
        for blank_column in ("file_or_script", "main_risk", "how_validated")
    },
})

project_map_template

# Filled-in project map for the ECB hawkish-index example.
# Each record is one pipeline stage, in column order:
# (stage, file_or_script, main_risk, how_validated).
_ecb_columns = ("stage", "file_or_script", "main_risk", "how_validated")
_ecb_stages = [
    (
        "raw data acquisition",
        "01_scrape_ecb_speeches.py",
        "ECB site restructure breaks URL regex; new HTML tags introduce parsing errors",
        "Re-run on different machine produces identical corpus (N=487, date range matches)",
    ),
    (
        "cleaning & deduplication",
        "02_clean_corpus.py",
        "Non-English speeches included by mistake; near-duplicates from translations inflate N",
        "Manual audit of 30 random documents: 100% English, 0 obvious duplicates",
    ),
    (
        "feature / index construction",
        "03_build_hawkish_index.py",
        "Dictionary too narrow misses hedged hawkish language; too broad picks up unrelated terms",
        "Index correlates 0.74 with hand-coded scores on 50-doc validation sample (kappa=0.62)",
    ),
    (
        "validation",
        "04_validate_index.py",
        "Hand-coded sample too small (N<20) to estimate kappa with useful confidence",
        "Disagreement cases documented in appendix; 3 of 7 are genuinely ambiguous",
    ),
    (
        "analysis",
        "05_analysis.py",
        "Spurious time trend driven by composition shift (more press conferences, fewer formal speeches)",
        "Robustness check with subsample of formal speeches only — qualitative result holds",
    ),
    (
        "figures & tables",
        "06_make_figures.py",
        "Figure axes default to misleading scales; event annotations missing key dates",
        "Each figure has source note, date range, N, and 3+ vertical event lines (Lagarde, COVID, ZIRP exit)",
    ),
    (
        "write-up",
        "L9_paper_draft.tex",
        "Causal language slips into conclusions where only descriptive evidence exists",
        "Conclusion explicitly states 'descriptive evidence' three times; no IV or RD claim made",
    ),
]

# Transpose the per-stage records into one list per column.
ecb_project_map = pd.DataFrame({
    column: list(values)
    for column, values in zip(_ecb_columns, zip(*_ecb_stages))
})

ecb_project_map

# Filled-in project map for the newspaper immigration-coverage example.
# Each record is one pipeline stage, in column order:
# (stage, file_or_script, main_risk, how_validated).
_newspaper_columns = ("stage", "file_or_script", "main_risk", "how_validated")
_newspaper_stages = [
    (
        "raw data acquisition",
        "01_scrape_corriere.py + 01b_scrape_repubblica.py",
        "Paywall blocks recent articles for one paper; sample becomes unbalanced over time",
        "Scraping log records HTTP status for each request; coverage rate documented per source/year",
    ),
    (
        "cleaning & deduplication",
        "02_clean_headlines.py",
        "Headlines change over the day (live editing); my scrape captures one snapshot per day",
        "Headline text re-scraped 3 days later for 5% sample; 12% differ (audit in appendix)",
    ),
    (
        "feature / index construction",
        "03_build_immigration_share.py",
        "Keyword list misses synonyms ('migranti' vs 'rifugiati' vs 'sbarchi') and shifts in usage",
        "Keyword list iterated with 5 native speakers; final list of 17 terms documented",
    ),
    (
        "validation",
        "04_validate_against_gallagher.py",
        "No external benchmark of newspaper attention; my measure cannot be triangulated",
        "Compared against ITANES survey question 'most important issue' — correlation 0.41 (weak but positive)",
    ),
    (
        "analysis",
        "05_event_study_2015_shock.py",
        "Pre-trend in immigration coverage starts before the 2015 shock — anchoring at the wrong date",
        "Pre-period robustness: trend estimated separately for 2010-2014; effect attenuated but persists",
    ),
    (
        "figures & tables",
        "06_make_figures.py",
        "Two newspapers plotted on same axis may dominate visually; need facet panels",
        "Figure uses small multiples (one panel per newspaper); event annotation = April 2015 shipwreck",
    ),
    (
        "write-up",
        "L9_paper_draft.tex",
        "Tempting to claim 'immigration coverage drove vote shift' — only descriptive evidence supports this",
        "Conclusion: 'we document a sharp and sustained increase in coverage after 2015' (descriptive, NOT causal)",
    ),
]

# Transpose the per-stage records into one list per column.
newspaper_project_map = pd.DataFrame({
    column: list(values)
    for column, values in zip(_newspaper_columns, zip(*_newspaper_stages))
})

newspaper_project_map

Failure mode	What it looks like	How to fix
Construct ambiguity	"We measure economic uncertainty" without defining what uncertainty is	Define operationally: "the share of headlines containing one of {recession, crisis, downturn, uncertainty}"
Hidden construction choices	"We use an LDA topic model" without listing K, alpha, beta, preprocessing	Method appendix with every hyperparameter; sensitivity to one alternative
Descriptive graph as causal evidence	A time-series plot with the headline "$X$ leads to $Y$"	Headline should describe the pattern ("$X$ and $Y$ co-move from 2015"); causal language only with a design
No discussion of missing data	Sample N reported once at the start, then forgotten	One sentence on missingness rate per source/period; one sensitivity check on restricted sample
Generic limitations	"Future research should use more data"	Specific: "our index does not capture sarcasm; 4 of 50 validation documents were misclassified for this reason"
Robustness theatre	12 robustness tables that all confirm the main result, exactly	One serious robustness check aimed at the most plausible threat, even if it weakens the result

Priority	Score 1 (weak)	Score 2 (adequate)	Score 3 (strong)
Index quality	Construct not defined or no validation	Construct defined; informal validation	Defined, hand-coded sample, kappa reported
Scraping reproducibility	Manual fixes during scraping; not re-runnable	Script terminates cleanly; corpus stats reported	Checkpoint logic; documented re-run on different machine
Honest interpretation	Causal language without identification	Mostly descriptive; some causal slips	Claims match design; limitations specific

Extra — Research Design & Replication

Companion notebook for building a dataset

Reading

1. A replication checklist for dataset projects

2. Project map — what to fill in

3. Worked example 1 — ECB hawkish index, 2015-2025

4. Worked example 2 — Newspaper coverage of immigration, 2010-2024

5. From pipeline to write-up

6. Common failure modes in dataset construction

7. Writing prompt — your own pre-submission audit

8. Optional — convert the checklist into a peer rubric

Wrap-up