conda activate econ
conda install -c conda-forge selenium

conda activate econ
pip uninstall -y selenium
pip install --no-cache-dir --force-reinstall --only-binary=:all: selenium

# pip install requests beautifulsoup4 lxml selenium
# Note: since Selenium 4.6 (Oct 2022), the driver binary is fetched automatically
# by Selenium Manager — webdriver-manager is no longer needed.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time, re, json
from pathlib import Path

print("Libraries loaded.")

# Demo: when `requests` isn't enough — the SAME URL, two approaches.
#
# Run HEADED in class so students see the browser open.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = "https://quotes.toscrape.com/js/"

# --- Attempt 1: plain `requests` -------------------------------------------
# Download the HTML exactly as the server sends it (no JavaScript is executed)
# and search it with the same CSS selector the Selenium attempt uses.
print(">>> Attempt 1: requests + BeautifulSoup")
html = requests.get(URL, timeout=10).text
soup = BeautifulSoup(html, "lxml")
# select() returns a list of matching tags; an empty list means no quote
# markup exists in the raw HTML.
quotes_via_requests = soup.select("div.quote span.text")
print(f"   Quotes found: {len(quotes_via_requests)}")
print(f"   Length of HTML returned: {len(html):,} characters")
print()
# Spoiler: 0 quotes. The HTML is a JavaScript shell — the quotes will be
# generated *in the browser* once the JS runs. `requests` doesn't run JS.

# --- Attempt 2: Selenium ----------------------------------------------------
# Load the same URL in a real Chrome instance so the page's JavaScript runs,
# then query the rendered DOM.
print(">>> Attempt 2: Selenium (real browser)")
options = Options()
# options.add_argument("--headless=new")   # un-comment for quiet runs
options.add_argument("--window-size=1280,900")
driver = webdriver.Chrome(options=options)

# try/finally guarantees the browser is closed even if a wait or lookup fails.
try:
    driver.get(URL)
    # Wait until the JS has populated the DOM with quote elements.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.quote"))
    )
    quotes_via_selenium = driver.find_elements(By.CSS_SELECTOR, "div.quote span.text")
    authors             = driver.find_elements(By.CSS_SELECTOR, "div.quote small.author")
    print(f"   Quotes found: {len(quotes_via_selenium)}")
    print()
    print("   First three quotes:")
    # Pair each quote with its author by position; only the first three are shown.
    for q, a in zip(quotes_via_selenium[:3], authors[:3]):
        print(f"     • {q.text[:70]}…  — {a.text}")
finally:
    driver.quit()

# Shared HTTP session: reuses connections and sends the same browser-like
# headers with every request made through it (see safe_get).
session = requests.Session()
session.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    # Deliberately no "br": requests can only decode Brotli when an optional
    # brotli/brotlicffi package is installed. Advertising it unconditionally
    # lets a server reply with a Brotli body that arrives undecoded, so
    # r.text would be garbage and every parser downstream would find nothing.
    "Accept-Encoding": "gzip, deflate",
    "Connection":      "keep-alive",
})

def safe_get(url, pause=1.5, sess=session):
    """Politely fetch *url*; return the response, or ``None`` on any failure.

    Args:
        url: Address to request.
        pause: Seconds to sleep *before* the request (rate limiting).
        sess: Session used for the request; defaults to the shared ``session``.

    Returns:
        The response object when the request succeeds with a non-error
        status, otherwise ``None`` (a one-line diagnostic is printed).
    """
    # Sleep first so that every call, including the first, is rate-limited.
    time.sleep(pause)
    try:
        response = sess.get(url, timeout=15)
        response.raise_for_status()
    except requests.HTTPError as err:
        # Raised by raise_for_status() for 4xx/5xx replies.
        print(f"  HTTP {err.response.status_code}: {url}")
        return None
    except requests.RequestException as err:
        # Any other requests-level failure (connection error, timeout, ...).
        print(f"  Error: {err}")
        return None
    else:
        return response

# Smoke test: safe_get returns None on failure, so print either the HTTP
# status code or a plain "Failed" marker.
r = safe_get("https://www.ecb.europa.eu/press/pr/date/2024/html/index_en.html")
print(r.status_code if r else "Failed")

# Pattern only — do not run without real credentials
def login(session, login_url, username, password, timeout=15):
    """Log in through an HTML form that may carry a CSRF token.

    Pattern only — adapt the field names to the real form before use.

    Args:
        session: A ``requests.Session``-like object; cookies set by the login
            stay on it for later requests.
        login_url: URL that serves the form and accepts the POST.
        username: Value sent in the ``username`` field.
        password: Value sent in the ``password`` field.
        timeout: Seconds to wait for each HTTP call. Without a timeout,
            requests waits indefinitely on an unresponsive server.

    Returns:
        ``True`` when the response looks like a logged-in page (it contains
        "logout", or the final URL differs from ``login_url``). This is a
        heuristic, not proof of authentication.
    """
    r = session.get(login_url, timeout=timeout)
    soup = BeautifulSoup(r.text, "lxml")
    # The hidden CSRF field is optional; send an empty token when it is absent.
    token_tag = soup.find("input", {"name": "csrf_token"})
    csrf = token_tag["value"] if token_tag else ""
    payload = {"username": username, "password": password, "csrf_token": csrf}
    r_post = session.post(login_url, data=payload, timeout=timeout)
    return "logout" in r_post.text.lower() or r_post.url != login_url

print("Login function defined (not executed).")

# Pattern 1: page number in URL
def scrape_all_pages_numbered(base_url, max_pages=10):
    """Collect ``div.result-item`` tags from numbered pages.

    Args:
        base_url: URL template containing a ``{page}`` placeholder.
        max_pages: Highest page number to request (pages start at 1).

    Returns:
        A list of all matching tags, in page order. Scraping stops early when
        a fetch fails or a page contains no matching items.
    """
    collected = []
    page_no = 1
    while page_no <= max_pages:
        response = safe_get(base_url.format(page=page_no))
        if response is None:
            break
        found = BeautifulSoup(response.text, "lxml").find_all(
            "div", class_="result-item"
        )
        if len(found) == 0:
            # An empty page is treated as "past the last page".
            print(f"  No results on page {page_no} — stopping.")
            break
        collected += found
        print(f"  Page {page_no}: {len(found)} items")
        page_no += 1
    return collected

# Pattern 2: follow "next" link
def scrape_follow_next(start_url):
    """Collect ``<article>`` tags by following "next" links from *start_url*.

    The next link is the first of: ``<a class="next">``, ``<a rel="next">``,
    or an ``<a>`` whose text contains "next" (case-insensitive).

    Args:
        start_url: URL of the first page.

    Returns:
        A list of all ``<article>`` tags found, in page order. Scraping stops
        when a fetch fails, no next link exists, or the next link points at a
        page that was already visited.
    """
    # Local import keeps this helper self-contained.
    from urllib.parse import urljoin

    all_items = []
    visited = set()   # guards against "next" links that loop back
    url = start_url
    page = 1
    while url and url not in visited:
        visited.add(url)
        r = safe_get(url)
        if r is None:
            break
        soup = BeautifulSoup(r.text, "lxml")
        items = soup.find_all("article")
        all_items.extend(items)
        print(f"  Page {page}: {len(items)} items")
        next_tag = (
            soup.find("a", class_="next") or
            soup.find("a", rel="next") or
            soup.find("a", string=re.compile(r"next", re.I))
        )
        href = next_tag.get("href") if next_tag else None
        # Resolve the link against the page it came from (after redirects),
        # as a browser would. Gluing href onto start_url breaks
        # root-relative ("/page/2/") and sibling-relative ("page2.html") links.
        url = urljoin(r.url, href) if href else None
        page += 1
    return all_items

print("Pagination helpers defined.")

# Demo: pagination by page number, applied to a stable teaching site.
#
# In your research the same pattern adapts to: ECB speeches, Fed press releases,
# parliamentary records, news archives — anywhere a list is paginated as
# /page/1, /page/2, … just change the URL template and the CSS selectors.
#
# (We use quotes.toscrape.com instead of a real central-bank archive because
# central-bank portals get redesigned every few years — don't tie a
# teaching example to selectors that go stale. For your research scraper,
# always inspect the live page first with DevTools.)

QUOTES_URL = "https://quotes.toscrape.com/page/{page}/"

def scrape_quotes_paginated(max_pages=10):
    """Scrape quotes page by page using the ``QUOTES_URL`` template.

    Args:
        max_pages: Highest page number to request (pages start at 1).

    Returns:
        A list of dicts with keys ``page``, ``text``, ``author`` and ``tags``
        (tags joined into one comma-separated string). Scraping stops early
        when a fetch fails or a page has no ``div.quote`` elements.
    """
    records = []
    for page_no in range(1, max_pages + 1):
        response = safe_get(QUOTES_URL.format(page=page_no))
        if response is None:
            break
        quote_blocks = BeautifulSoup(response.text, "lxml").select("div.quote")
        if not quote_blocks:
            print(f"  Page {page_no}: empty — stopping.")
            break
        records.extend(
            {
                "page":   page_no,
                "text":   block.select_one("span.text").get_text(strip=True),
                "author": block.select_one("small.author").get_text(strip=True),
                "tags":   ", ".join(tag.get_text() for tag in block.select("a.tag")),
            }
            for block in quote_blocks
        )
        print(f"  Page {page_no}: {len(quote_blocks)} quotes")
    return records

# Run the paginated scraper and load the result into a DataFrame.
quote_records = scrape_quotes_paginated(10)
quotes_df     = pd.DataFrame(quote_records)
if quotes_df.empty:
    # With no records the frame has no "page" column, so the summary below
    # would raise KeyError instead of reporting the real problem.
    print("\nTotal: 0 quotes — nothing was scraped (network problem or changed page layout?).")
else:
    print(f"\nTotal: {len(quotes_df)} quotes across {quotes_df['page'].max()} pages")
quotes_df.head()

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def make_driver(headless=True):
    """Build a Chrome WebDriver with the notebook's standard flags.

    Args:
        headless: When true, Chrome runs without a visible window; pass
            ``False`` to watch the browser (useful for debugging).

    Returns:
        A started ``webdriver.Chrome`` instance. The caller must ``quit()`` it.
    """
    chrome_flags = [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1280,900",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36",
    ]
    if headless:
        # Kept first so the argument order matches a headless launch.
        chrome_flags.insert(0, "--headless=new")

    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)

print("Driver factory defined.")

# Minimal Selenium round-trip: open a page, read a few driver properties,
# and always close the browser afterwards.
driver = make_driver(headless=True)

try:
    driver.get("https://www.ecb.europa.eu")
    print("Page title :", driver.title)
    print("Current URL:", driver.current_url)

    # The page source is the HTML *as the browser sees it* — after JS has run.
    # That's what makes Selenium worth the overhead.
    print("HTML length:", len(driver.page_source))
finally:
    # Runs even if driver.get() raises, so no browser process is left behind.
    driver.quit()

# Locating elements: find_elements (list) versus find_element (single match).
driver = make_driver(headless=True)

try:
    driver.get("https://quotes.toscrape.com/js/")
    # Give the JS a moment to render (in 3.4 we'll do this properly with waits)
    time.sleep(2)

    # Multiple matches — find_elements returns a list (empty if none found)
    quote_texts = driver.find_elements(By.CSS_SELECTOR, "div.quote span.text")
    print(f"Found {len(quote_texts)} quotes.\n")
    # Show at most five quotes, each truncated to 90 characters.
    for q in quote_texts[:5]:
        print(" -", q.text.strip()[:90])

    # Single match — find_element raises NoSuchElementException if absent
    h1 = driver.find_element(By.TAG_NAME, "h1")
    print("\nPage H1:", h1.text.strip())
finally:
    driver.quit()

# Explicit waits: block until the quotes exist instead of sleeping a fixed time.
driver = make_driver(headless=True)

try:
    driver.get("https://quotes.toscrape.com/js/")

    # Wait up to 10 s for the JS to populate the page with quote elements,
    # then extract the first 5. Without the wait, the page source would
    # contain only the JavaScript shell.
    wait      = WebDriverWait(driver, 10)
    # until() returns whatever the condition returns — here, the matched elements.
    quotes    = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.quote"))
    )
    print(f"Quotes appeared. Got {len(quotes)} elements; showing first 5:\n")
    for q in quotes[:5]:
        # Look up text and author *inside* each quote element.
        text   = q.find_element(By.CSS_SELECTOR, "span.text").text.strip()
        author = q.find_element(By.CSS_SELECTOR, "small.author").text.strip()
        print(f" — {text[:70]}… ({author})")

except Exception as e:
    # Demo-only catch-all: report the failure (e.g. a wait timeout) without
    # a traceback. The finally clause below still closes the browser.
    print(f"Failed: {type(e).__name__}: {e}")

finally:
    driver.quit()

# Interacting with a page: type into a search box, submit, and navigate.
#
# Only ONE browser is created here. Creating a second driver and rebinding
# `driver` to it would orphan the first: nothing could call quit() on it any
# more, leaving a stray Chrome process behind.
options = Options()
# options.add_argument("--headless=new")   # un-comment for quiet runs
options.add_argument("--window-size=1280,900")
driver = webdriver.Chrome(options=options)

# The time.sleep(4) pauses below are for the audience only: they slow the demo
# down so each step is visible in the browser window. They are not waits for
# the page.
try:
    time.sleep(4)
    # 1) Open Wikipedia and find the search box
    start_url = "https://en.wikipedia.org/wiki/Main_Page"
    driver.get(start_url)
    box = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.NAME, "search"))
    )
    time.sleep(4)
    # 2) Type the query AND press RETURN in a SINGLE send_keys call.
    #    Why not two separate calls? Wikipedia shows an autocomplete
    #    dropdown the moment we start typing, which re-paints the DOM
    #    around the search box. A second send_keys for Keys.RETURN
    #    arriving immediately afterwards finds the element momentarily
    #    not interactable and raises ElementNotInteractableException.
    #    Sending the whole sequence at once avoids that race.
    box.send_keys("European Central Bank" + Keys.RETURN)
    time.sleep(4)
    # 3) Wait for the search to actually navigate. We must NOT wait for
    #    `#firstHeading` here — that element also exists on Main_Page,
    #    so the wait would pass instantly without the search ever resolving.
    #    Instead, wait for the URL to change.
    WebDriverWait(driver, 10).until(EC.url_changes(start_url))
    time.sleep(4)
    # 4) Now read the article-specific elements
    h1 = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "firstHeading"))
    )
    print("Article :", h1.text.strip())
    print("URL     :", driver.current_url)

    first_p = driver.find_element(By.CSS_SELECTOR, "#mw-content-text p")
    print("\nLead paragraph:")
    print(" ", first_p.text.strip()[:300], "…")

    time.sleep(4)
    # 5) Navigate back to Main_Page, then forward again
    driver.back()
    print("\nAfter back   :", driver.current_url)
    time.sleep(4)
    driver.forward()
    print("After forward:", driver.current_url[:80])

finally:
    # Keep the window open briefly so the final state is visible, then close.
    time.sleep(10)
    driver.quit()

# Render with Selenium, parse with BeautifulSoup — best of both worlds.
# Target: quotes.toscrape.com/js/ (JS-rendered — `requests` would see only
# the JavaScript shell; Selenium sees the populated DOM).
#
# In your research scraper, just swap the URL and the CSS selectors;
# the structure of the cell stays identical.

driver = make_driver(headless=True)

try:
    driver.get("https://quotes.toscrape.com/js/")
    # Block until at least one quote exists, i.e. the JS has rendered.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.quote"))
    )

    # Hand the rendered HTML to BeautifulSoup for clean extraction
    soup = BeautifulSoup(driver.page_source, "lxml")
    rows = []
    for q in soup.select("div.quote"):
        rows.append({
            "text":   q.select_one("span.text").get_text(strip=True),
            "author": q.select_one("small.author").get_text(strip=True),
            # Tags are kept as a list here (one entry per a.tag element).
            "tags":   [t.get_text() for t in q.select("a.tag")],
        })
    df_quotes = pd.DataFrame(rows)
    print(f"Parsed {len(df_quotes)} quotes\n")
    print(df_quotes.head())
finally:
    # page_source was copied into `soup`, so parsing never needs the live
    # browser; it is closed here regardless of success or failure.
    driver.quit()

# Intercepting JSON APIs — often faster, cleaner, and more stable than
# scraping the rendered HTML. How to find them:
#   DevTools → Network tab → filter XHR/Fetch → reload the page
#   Look for requests that return JSON — those are your real data endpoints.
#
# Below is a working example against the ECB Data Portal SDMX REST endpoint.
# Pattern: data-api.ecb.europa.eu/service/data/{flowRef}/{key}?format=jsondata
# This is the canonical example from the ECB's own API documentation:
# the daily EUR/USD spot exchange rate.

# Endpoint for the series key EXR/D.USD.EUR.SP00.A, requesting JSON output
# and only the last 5 observations.
API_URL = (
    "https://data-api.ecb.europa.eu/service/data/"
    "EXR/D.USD.EUR.SP00.A?format=jsondata&lastNObservations=5"
)

r = safe_get(API_URL, pause=1.0)
# Proceed only if the fetch succeeded AND the server labelled the body with
# an "application/..." content type.
if r and r.headers.get("Content-Type", "").lower().startswith("application/"):
    try:
        data = r.json()
        # Top-level keys describe the SDMX message envelope
        print("JSON top-level keys:", list(data.keys()))
        # Drill in just enough to show the shape
        if "dataSets" in data:
            print("Number of datasets returned:", len(data["dataSets"]))
            # Pull out the most recent observations
            # Chained .get() calls fall back to {} so a missing key prints
            # nothing instead of raising KeyError.
            obs = data["dataSets"][0].get("series", {}).get("0:0:0:0:0", {}).get("observations", {})
            print("Observations (index → [value]):")
            for k, v in list(obs.items())[:5]:
                print(f"  {k}: {v}")
        print("\nKey insight: a single GET returns structured data — no DOM, no waits.")
    except ValueError:
        # r.json() raised: the body was not parseable JSON despite the header.
        print("Response was not valid JSON. Content-Type:", r.headers.get("Content-Type"))
else:
    # Either safe_get returned None or the content type was not application/*.
    print("No JSON response. Content-Type:",
          r.headers.get("Content-Type") if r else "no response")
    print("\nFallback message: ALWAYS check for an API before writing a Selenium scraper.")
    print("DevTools Network panel will show you whether the page calls one.")

# YOUR CODE HERE — 10 minutes
# Skeleton:
# driver = make_driver(headless=False)
# try:
#     driver.get(...)
#     WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ...)))
#     items = driver.find_elements(By.CSS_SELECTOR, ...)
#     records = [{"...": it.text, ...} for it in items[:10]]
#     pd.DataFrame(records)
# finally:
#     driver.quit()

# Solution — Option A: infinite-scroll scraper
#
# Strategy:
#   1. Scroll to the bottom.
#   2. Wait briefly for new quotes to load (network round-trip).
#   3. Re-count quotes; if the count didn't grow, we've hit the end.
#   4. Stop after `target` quotes or when the page stops producing new ones.

driver = make_driver(headless=False)
QUOTES_CSS = "div.quote"   # one element per quote on the page
target     = 30            # stop scrolling once at least this many are loaded

try:
    driver.get("https://quotes.toscrape.com/scroll")

    # Wait for the first batch of quotes
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, QUOTES_CSS))
    )

    # `seen` holds the quote count observed before the previous scroll.
    seen = 0
    while True:
        quotes = driver.find_elements(By.CSS_SELECTOR, QUOTES_CSS)
        if len(quotes) >= target:
            break
        if len(quotes) == seen:
            # Scrolled but no new content arrived — we've hit the bottom.
            print(f"  No new quotes after scrolling — stopping at {seen}.")
            break
        seen = len(quotes)
        # Jump to the bottom of the document to trigger the next batch.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)   # give the JS time to fetch the next batch

    # Extract structured data
    # Every loaded quote is extracted, so the result may exceed `target`.
    records = []
    for q in driver.find_elements(By.CSS_SELECTOR, QUOTES_CSS):
        records.append({
            "text":   q.find_element(By.CSS_SELECTOR, "span.text").text,
            "author": q.find_element(By.CSS_SELECTOR, "small.author").text,
            "tags":   [t.text for t in q.find_elements(By.CSS_SELECTOR, "a.tag")],
        })
    df_quotes = pd.DataFrame(records)
    print(f"\nCollected {len(df_quotes)} quotes")
    print(df_quotes.head())

finally:
    driver.quit()

# Solution — Option B: paginated JS-rendered site

from selenium.common.exceptions import TimeoutException

driver = make_driver(headless=False)

try:
    rows = []
    for page in range(1, 6):                      # pages 1..5
        driver.get(f"https://quotes.toscrape.com/js/page/{page}/")

        # Wait briefly for at least one quote to render. If none appears,
        # we've gone past the last page — stop cleanly.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.quote"))
            )
        except TimeoutException:
            print(f"  Page {page}: no quotes — stopping.")
            break

        for q in driver.find_elements(By.CSS_SELECTOR, "div.quote"):
            rows.append({
                "page":   page,
                "text":   q.find_element(By.CSS_SELECTOR, "span.text").text,
                "author": q.find_element(By.CSS_SELECTOR, "small.author").text,
                "tags":   [t.text for t in q.find_elements(By.CSS_SELECTOR, "a.tag")],
            })
        print(f"  Page {page}: cumulative total = {len(rows)}")

    df_quotes = pd.DataFrame(rows)
    if df_quotes.empty:
        # Page 1 timed out, so no row was ever added and the frame has no
        # "page" column; indexing it would raise KeyError and hide the cause.
        print("\nCollected 0 quotes — the first page never rendered any.")
    else:
        print(f"\nCollected {len(df_quotes)} quotes across {df_quotes['page'].max()} pages")
        print(df_quotes.head())

finally:
    driver.quit()

# Build ecb_corpus.csv from a hardcoded list of stable ECB document URLs.

# Documents to download. The entries are not in date order; the corpus
# DataFrame built from them is sorted by date afterwards.
ECB_DOCS = [
    # (date, url) — press release vs. press-conference statement, mixed.
    ("2024-01-25", "https://www.ecb.europa.eu/press/pr/date/2024/html/ecb.mp240125~f738889bde.en.html"),
    ("2024-12-12", "https://www.ecb.europa.eu/press/press_conference/monetary-policy-statement/2024/html/ecb.is241212~ce143b3bc8.en.html"),
    ("2025-10-30", "https://www.ecb.europa.eu/press/press_conference/monetary-policy-statement/2025/html/ecb.is251030~4f74dde15e.en.html"),
    ("2023-09-14", "https://www.ecb.europa.eu/press/press_conference/monetary-policy-statement/2023/html/ecb.is230914~686786984a.en.html"),
    ("2023-05-04", "https://www.ecb.europa.eu/press/pr/date/2023/html/ecb.mp230504~cdfd11a697.en.html"),
]

def parse_ecb_document(url, date_str):
    """Download one ECB document and pull out its title and body text.

    Args:
        url: Address of the document page.
        date_str: Publication date, passed through unchanged into the result.

    Returns:
        ``None`` when the download fails; otherwise a dict with keys
        ``date_parsed``, ``title``, ``url`` and ``body``.

    The lookup is deliberately tolerant of different page templates:
    content is searched in ``<main id="main-content">``, then any ``<main>``,
    then the whole document; a missing ``<h1>`` yields "(no title)".
    """
    response = safe_get(url, pause=1.5)
    if response is None:
        return None

    page = BeautifulSoup(response.text, "lxml")

    # Narrow down to the main content area, widening the search step by step.
    container = page.find("main", id="main-content")
    if container is None:
        container = page.find("main")
    if container is None:
        container = page

    heading = container.find("h1")
    title = "(no title)" if heading is None else heading.get_text(strip=True)

    # Keep only paragraphs of at least 40 characters; shorter ones are often
    # navigation labels, image captions, or 'Related topics' snippets.
    kept = []
    for para in container.find_all("p"):
        if len(para.get_text(strip=True)) < 40:
            continue
        kept.append(para.get_text(" ", strip=True))

    return {
        "date_parsed": date_str,
        "title":       title,
        "url":          url,
        "body":         "\n\n".join(kept),
    }

# Download every document in ECB_DOCS; failed downloads are reported and skipped.
records = []
for date_str, url in ECB_DOCS:
    rec = parse_ecb_document(url, date_str)
    if rec is None:
        print(f"  SKIP {date_str}: fetch failed")
        continue
    records.append(rec)
    # Progress line: word count of the body plus the first 50 title characters.
    print(f"  OK   {date_str}: {len(rec['body'].split())} words — {rec['title'][:50]}…")

# Fail loudly rather than write an empty corpus file.
if not records:
    raise RuntimeError("No documents could be fetched. Check your network connection.")

corpus_df = pd.DataFrame(records)
# Convert the date strings to datetimes so the sort below is chronological.
corpus_df["date_parsed"] = pd.to_datetime(corpus_df["date_parsed"])
corpus_df = corpus_df.sort_values("date_parsed").reset_index(drop=True)

# Persist to disk — Section 4 will pick this up on the next cell run.
corpus_df.to_csv("ecb_corpus.csv", index=False)
print(f"\nSaved ecb_corpus.csv with {len(corpus_df)} documents.")
print(f"Date range: {corpus_df['date_parsed'].min().date()} → {corpus_df['date_parsed'].max().date()}")
corpus_df[["date_parsed", "title"]].head()

# Load the scraped corpus if it exists; otherwise fall back to a small
# built-in corpus so the rest of the notebook still runs.
try:
    # NOTE: read_csv is called without date parsing, so "date_parsed" is not
    # converted to datetime in this branch.
    corpus_df = pd.read_csv("ecb_corpus.csv")
    print(f"Loaded corpus: {len(corpus_df)} documents")
except FileNotFoundError:
    # Synthetic fallback: 8 ECB-style monetary-policy press releases spanning
    # three years. Bodies are kept around 80–120 words — long enough
    # to pass the MIN_WORDS filter applied later, short enough to read.
    # Unlike the scraped corpus, this frame has no "url" column.
    corpus_df = pd.DataFrame({
        "date_parsed": [
            "2024-01-25", "2024-04-11", "2024-09-12",
            "2023-10-26", "2023-09-14", "2023-06-15",
            "2022-12-15", "2022-09-08",
        ],
        "title": ["Monetary policy decisions"] * 8,
        "body": [
            # 2024-01-25
            "The Governing Council today decided to keep the three key ECB interest rates unchanged. "
            "The deposit facility rate remains at 4.00%, the main refinancing operations rate at 4.50% "
            "and the marginal lending facility rate at 4.75%. Inflation is projected to decline gradually "
            "further over 2024 but will remain above the 2% target for most of the year. The Council "
            "reiterated that future decisions will ensure that policy rates are set at sufficiently "
            "restrictive levels for as long as necessary to achieve a timely return of inflation to the "
            "medium-term target. Future decisions will continue to follow a data-dependent approach.",
            # 2024-04-11
            "The Governing Council today decided to keep the three key ECB interest rates unchanged. "
            "Incoming information has broadly confirmed the Governing Council's previous assessment of "
            "the medium-term inflation outlook. Inflation has continued to fall, led by lower food and "
            "goods inflation. Most measures of underlying inflation are easing, wage growth is gradually "
            "moderating and firms are absorbing part of the rise in labour costs in their profits. "
            "Financing conditions remain restrictive and past interest rate increases continue to weigh "
            "on demand, helping to push down inflation toward the target.",
            # 2024-09-12
            "The Governing Council today decided to lower the deposit facility rate by 25 basis points. "
            "Based on the Governing Council's updated assessment of the inflation outlook, the dynamics "
            "of underlying inflation, and the strength of monetary policy transmission, it is now "
            "appropriate to take another step in moderating the degree of monetary policy restriction. "
            "Recent inflation data have come in broadly as expected, and the latest staff projections "
            "confirm the previous inflation outlook. Domestic inflation remains high as wages are still "
            "rising at an elevated pace.",
            # 2023-10-26
            "The Governing Council today decided to keep the three key ECB interest rates unchanged. "
            "The incoming information has broadly confirmed the previous assessment of the medium-term "
            "inflation outlook. Inflation is still expected to stay too high for too long, and domestic "
            "price pressures remain strong. At the same time, inflation dropped markedly in September, "
            "including due to strong base effects, and most measures of underlying inflation have "
            "continued to ease. The Governing Council's past interest rate increases continue to be "
            "transmitted forcefully into financing conditions.",
            # 2023-09-14
            "The Governing Council today decided to raise the three key ECB interest rates by 25 basis "
            "points. The deposit facility rate will be increased to 4.00%. Inflation continues to decline "
            "but is still expected to remain too high for too long. The Governing Council is determined "
            "to ensure that inflation returns to its 2% medium-term target in a timely manner. Based on "
            "the current assessment, the Governing Council considers that the key ECB interest rates "
            "have reached levels that, maintained for a sufficiently long duration, will make a "
            "substantial contribution to the timely return of inflation to the target.",
            # 2023-06-15
            "The Governing Council today decided to raise the three key ECB interest rates by 25 basis "
            "points. The deposit facility rate will be increased to 3.50%. Inflation has been coming "
            "down but is projected to remain too high for too long. The Governing Council is determined "
            "to ensure the timely return of inflation to the 2% medium-term target. The Council will "
            "continue to follow a data-dependent approach to determining the appropriate level and "
            "duration of restriction.",
            # 2022-12-15
            "The Governing Council today decided to raise the three key ECB interest rates by 50 basis "
            "points. Based on the substantial upward revision to the inflation outlook, the deposit "
            "facility rate will be increased to 2.00%. The Governing Council judges that interest rates "
            "will still have to rise significantly at a steady pace to reach levels that are sufficiently "
            "restrictive to ensure a timely return of inflation to the 2% medium-term target. Keeping "
            "interest rates at restrictive levels will over time reduce inflation by dampening demand.",
            # 2022-09-08
            "The Governing Council today decided to raise the three key ECB interest rates by 75 basis "
            "points. This major step frontloads the transition from the prevailing highly accommodative "
            "level of policy rates towards levels that will ensure the timely return of inflation to "
            "the 2% medium-term target. Inflation remains far too high and is likely to stay above target "
            "for an extended period. In August, euro area inflation reached 9.1%. Soaring energy and food "
            "prices, demand pressures in some sectors owing to the reopening of the economy, and supply "
            "bottlenecks are still driving up inflation.",
        ],
    })
    corpus_df["date_parsed"] = pd.to_datetime(corpus_df["date_parsed"])
    print(f"Using synthetic corpus ({len(corpus_df)} documents).")
corpus_df.head()

def clean_text(text):
    """Return *text* with markup removed and whitespace normalised.

    Non-string input (e.g. ``None`` or NaN from pandas) yields ``""``.
    Processing order: replace tag-like ``<...>`` spans with a space, decode
    the ``&amp;`` and ``&nbsp;`` entities, collapse whitespace runs to a
    single space, and strip both ends.
    """
    if isinstance(text, str):
        without_tags = re.sub(r"<[^>]+>", " ", text)
        # &amp; is decoded before &nbsp;, so "&amp;nbsp;" also ends up as a space.
        decoded = without_tags.replace("&amp;", "&")
        decoded = decoded.replace("&nbsp;", " ")
        collapsed = re.sub(r"\s+", " ", decoded)
        return collapsed.strip()
    return ""

def extract_metadata(df):
    """Return a copy of *df* with cleaned text and simple size/date features.

    Added columns: ``body_clean`` (``clean_text`` applied to ``body``),
    ``n_chars`` and ``n_words``. If a ``date_parsed`` column is present it is
    converted to datetime (unparseable values become NaT) and ``year`` and
    ``month`` are derived from it. The input frame is not modified.
    """
    out = df.copy()

    cleaned = out["body"].apply(clean_text)
    out["body_clean"] = cleaned
    out["n_chars"] = cleaned.str.len()
    out["n_words"] = cleaned.str.split().str.len()

    if "date_parsed" not in out.columns:
        return out

    parsed = pd.to_datetime(out["date_parsed"], errors="coerce")
    out["date_parsed"] = parsed
    out["year"] = parsed.dt.year
    out["month"] = parsed.dt.month
    return out

# Add the cleaned text and metadata columns, then summarise them.
corpus_df = extract_metadata(corpus_df)
corpus_df[["date_parsed","n_chars","n_words"]].describe().round(1)

# Documents with fewer cleaned words than this are flagged as invalid.
MIN_WORDS = 50

corpus_df["is_valid"] = corpus_df["n_words"] >= MIN_WORDS
print(f"Valid documents (>= {MIN_WORDS} words): {corpus_df['is_valid'].sum()} / {len(corpus_df)}")

# List the rejected documents so they can be inspected by hand.
invalid = corpus_df[~corpus_df["is_valid"]][["date_parsed","title","n_words"]]
if len(invalid) > 0:
    print("\nDocuments flagged for review:")
    print(invalid.to_string(index=False))

# Keep only valid documents and persist them for the later cells.
clean_corpus = corpus_df[corpus_df["is_valid"]].reset_index(drop=True)
clean_corpus.to_csv("corpus_clean.csv", index=False)
print(f"Saved: corpus_clean.csv ({len(clean_corpus)} documents)")
print(f"  Avg words: {clean_corpus['n_words'].mean():.0f}")
print(f"  Total words: {clean_corpus['n_words'].sum():,}")

# Corpus self-check
# Re-read the saved file from disk to confirm it round-trips, then print a
# short summary. Each optional column is checked before it is used.
my_corpus = pd.read_csv("corpus_clean.csv")
print("=== CORPUS SUMMARY ===")
print(f"  Documents : {len(my_corpus)}")
print(f"  Columns   : {list(my_corpus.columns)}")
if "date_parsed" in my_corpus.columns:
    # Dates come back from CSV as strings; unparseable values become NaT.
    dates = pd.to_datetime(my_corpus["date_parsed"], errors="coerce")
    print(f"  Date range: {dates.min().date()} – {dates.max().date()}")
if "n_words" in my_corpus.columns:
    print(f"  Avg words : {my_corpus['n_words'].mean():.0f}")
my_corpus.head(3)

# Task 1 — Selenium rewrite (if needed)
# YOUR CODE HERE

# Task 2 — additional metadata
# YOUR CODE HERE

# Task 3 — corpus overview figure
# YOUR CODE HERE

# ── SOLUTION ──────────────────────────────────────────────────────────────────
import pandas as pd, matplotlib.pyplot as plt, re

# Load the cleaned corpus; dates are stored as text in the CSV, so re-parse
# them (unparseable values become NaT) before deriving the year.
corpus = pd.read_csv("corpus_clean.csv")
corpus["date_parsed"] = pd.to_datetime(corpus["date_parsed"], errors="coerce")
corpus["year"] = corpus["date_parsed"].dt.year

# Task 2
def count_sentences(text):
    """Return the number of sentences in *text* (0 for non-string input).

    A sentence ends at a run of ``.``, ``!`` or ``?`` that is followed by
    whitespace or the end of the text. Requiring that boundary keeps decimal
    figures such as "4.00%" or "9.1%" — common in this corpus — from being
    split into extra "sentences". Abbreviations followed by a space
    (e.g. "e.g. ") are still counted as sentence ends.
    """
    if not isinstance(text, str):
        return 0
    pieces = re.split(r"[.!?]+(?:\s+|$)", text)
    # Splitting leaves an empty trailing piece after final punctuation; skip it.
    return sum(1 for piece in pieces if piece.strip())

corpus["n_sentences"]  = corpus["body_clean"].apply(count_sentences)
# Replace a zero sentence count with 1 so the division never divides by zero.
corpus["avg_sent_len"] = (corpus["n_words"] / corpus["n_sentences"].replace(0, 1)).round(1)
# Case-insensitive flag: does the text mention either phrase? Missing
# text counts as False (na=False).
corpus["mentions_rates"] = corpus["body_clean"].str.contains(
    r"interest rate|deposit facility", case=False, regex=True, na=False
)
print(corpus[["date_parsed","n_words","n_sentences","avg_sent_len","mentions_rates"]].head())

# Task 3
# Aggregate per year: number of documents and mean word count.
by_year = corpus.groupby("year").agg(
    n_docs    = ("title",   "count"),
    avg_words = ("n_words", "mean"),
).reset_index()

# Two side-by-side panels: document counts (bars) and average length (line).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.bar(by_year["year"], by_year["n_docs"], color="steelblue", edgecolor="white")
ax1.set_title("Documents per year"); ax1.set_xlabel("Year"); ax1.set_ylabel("Count")

ax2.plot(by_year["year"], by_year["avg_words"], marker="o", color="tomato", linewidth=2)
ax2.set_title("Avg word count per year"); ax2.set_xlabel("Year"); ax2.set_ylabel("Words")

fig.suptitle("ECB Corpus Overview", fontsize=13, y=1.02)
fig.tight_layout()
# Save before plt.show() so the file is written while the figure is still open.
fig.savefig("corpus_overview.png", dpi=300, bbox_inches="tight")
plt.show()
print("Saved: corpus_overview.png")

# locators.py — all CSS/XPath selectors live here
from selenium.webdriver.common.by import By

class HomePageLocators:
    """Locators for the home page, as (By strategy, value) tuples."""

    # Search input, located by its name attribute.
    SEARCH_BOX = (By.NAME, "q")
    # Button that submits the search, located by its id attribute.
    GO_BUTTON  = (By.ID, "submit")

class ResultsPageLocators:
    """Locators for the search-results page, as (By strategy, value) tuples."""

    # Any element whose text contains the literal "No results found".
    NO_RESULTS = (By.XPATH, "//*[contains(text(), 'No results found')]")

# pages.py — one class per page; methods describe user actions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class BasePage:
    def __init__(self, driver, timeout=10):
        self.driver = driver
        self.wait   = WebDriverWait(driver, timeout)

class HomePage(BasePage):
    def title_matches(self, expected):
        return expected in self.driver.title

    def search(self, query):
        box = self.wait.until(EC.element_to_be_clickable(HomePageLocators.SEARCH_BOX))
        box.clear()
        box.send_keys(query)
        self.driver.find_element(*HomePageLocators.GO_BUTTON).click()
        return ResultsPage(self.driver)

class ResultsPage(BasePage):
    """Page object for the search-results page."""

    def has_results(self):
        """Return False when the page source contains 'No results found'."""
        empty_marker = "No results found"
        html = self.driver.page_source
        return empty_marker not in html

# test_search.py — the test method reads like prose; Selenium plumbing stays in setUp and the page objects
import unittest
from selenium import webdriver

class PythonOrgSearchTest(unittest.TestCase):
    """End-to-end check of the python.org search, driven through page objects."""

    def setUp(self):
        """Start Chrome and open the python.org home page."""
        self.driver = webdriver.Chrome()
        # Register the shutdown right after the browser exists: unittest does
        # not call tearDown() when setUp() raises, but cleanups registered
        # with addCleanup() still run, so a failing get() cannot leak a browser.
        self.addCleanup(self.driver.quit)
        self.driver.get("https://www.python.org")

    def test_search_pycon(self):
        """Searching for 'pycon' from the home page yields some results."""
        home = HomePage(self.driver)
        self.assertTrue(home.title_matches("Python"))
        results = home.search("pycon")
        self.assertTrue(results.has_results())

# To run from a terminal:  python -m unittest test_search.py
# (In a notebook, you can use unittest.main(argv=[''], exit=False))

Pattern	Example URL	Strategy
Page number in URL	`?page=1`, `/page/2`	Loop over integers
Offset in URL	`?start=0`, `?offset=20`	Loop with step
"Next" link in HTML	`<a class="next">Next</a>`	Follow links until absent

`By.*`	What it matches	Example
`ID`	element with `id="..."`	`find_element(By.ID, "search")`
`NAME`	element with `name="..."` (typical for form inputs)	`find_element(By.NAME, "q")`
`CLASS_NAME`	element whose `class` attribute includes the given class name (one name only, no spaces)	`find_element(By.CLASS_NAME, "result")`
`TAG_NAME`	tag name	`find_elements(By.TAG_NAME, "article")`
`LINK_TEXT`	`<a>` with exact visible text	`find_element(By.LINK_TEXT, "Press releases")`
`PARTIAL_LINK_TEXT`	`<a>` containing the text	`find_element(By.PARTIAL_LINK_TEXT, "press")`
`CSS_SELECTOR`	any CSS selector	`find_elements(By.CSS_SELECTOR, "dl.ecb-basicList dd a")`
`XPATH`	any XPath expression	`find_element(By.XPATH, "//h1[@class='title']")`

Condition	Meaning
`EC.presence_of_element_located((By.X, val))`	Element exists in the DOM (may not yet be visible)
`EC.visibility_of_element_located((By.X, val))`	Element is in the DOM and visible
`EC.element_to_be_clickable((By.X, val))`	Element is visible and enabled
`EC.text_to_be_present_in_element((By.X, val), "text")`	The element contains the given text
`EC.url_contains("path")`	The current URL contains the substring

Column	Description
`date_parsed`	Document date as `datetime`
`title`	Document title or headline
`body_clean`	Cleaned body text
`url`	Source URL
`n_words`	Word count

Situation	Tool
Static HTML	`requests` + `BeautifulSoup`
Need cookies / session	`requests.Session()`
JavaScript-rendered content	`selenium` (4.6+, no `webdriver-manager` needed)
Site loads data via JSON API	`requests.get(api_url).json()` — try this first!
Pagination by page number	Loop over integers in URL
Pagination by "Next" link	Follow `<a rel="next">`
Element location	`driver.find_element(By.CSS_SELECTOR, "...")`
Avoid race conditions	`WebDriverWait` + `EC.*` (never `time.sleep` for waits)
Maintainable scraper	Page Object Model (Section 7)

Lecture 6 — Web Scraping II: Sessions, Headers & Selenium

Python for Economists · University of Bologna · 2025/2026

What we cover today

Prerequisites

Troubleshooting

0. Warm-up — when `requests` isn't enough

Demo — same page, two approaches

A quick word on what Selenium can do — and what you should do

1. Sessions and cookies

3. Selenium — the in-depth tour

When do you need Selenium?

When you do NOT need Selenium

3.1 Driver setup

3.2 Your first browser session — open, navigate, inspect

3.3 Locating elements: `By.*` and `find_element[s]`

3.4 Explicit waits — `WebDriverWait` + `expected_conditions`

3.5 Interactions: clicks, keyboard input, navigation

3.6 Selenium + BeautifulSoup — best of both worlds

3.7 The faster route — intercepting JSON APIs

⏱ Ten-minute challenge — your first Selenium scraper

3.8 Putting it together — build a real corpus

4. Building and cleaning a text corpus

5. Milestone — Your corpus is ready

6. Exercise

7. (optional) From script to production: the Page Object Model

Summary

Next lecture

Lecture 6 — Web Scraping II: Sessions, Headers & Selenium

Python for Economists · University of Bologna · 2025/2026

What we cover today

Prerequisites

Troubleshooting

0. Warm-up — when requests isn't enough

Demo — same page, two approaches

A quick word on what Selenium can do — and what you should do

1. Sessions and cookies

1.1 When a site requires login

2. Handling pagination systematically

3. Selenium — the in-depth tour

When do you need Selenium?

When you do NOT need Selenium

3.1 Driver setup

3.2 Your first browser session — open, navigate, inspect

3.3 Locating elements: By.* and find_element[s]

3.4 Explicit waits — WebDriverWait + expected_conditions

3.5 Interactions: clicks, keyboard input, navigation

3.6 Selenium + BeautifulSoup — best of both worlds

3.7 The faster route — intercepting JSON APIs

⏱ Ten-minute challenge — your first Selenium scraper

3.8 Putting it together — build a real corpus

4. Building and cleaning a text corpus

5. Milestone — Your corpus is ready

6. Exercise

7. (optional) From script to production: the Page Object Model

Summary

Next lecture

0. Warm-up — when `requests` isn't enough

3.3 Locating elements: `By.*` and `find_element[s]`

3.4 Explicit waits — `WebDriverWait` + `expected_conditions`