# Install if needed
# pip install requests beautifulsoup4 lxml pandas matplotlib

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime

print("Libraries loaded.")

# Live demo — minimal scraper, ~10 lines.
# Goal: list the titles and absolute URLs of the speeches linked from the
# Fed's 2024 speech archive page.
URL = "https://www.federalreserve.gov/newsevents/speech/2024-speeches.htm"
# The User-Agent carries a contact address so the site owner can reach us.
HEADERS = {"User-Agent": "Mozilla/5.0 (teaching; contact: marco.rosso4@unibo.it)"}

resp = requests.get(URL, headers=HEADERS, timeout=10)
resp.raise_for_status()   # fail on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(resp.text, "lxml")

# Each Fed speech URL follows a recognisable pattern:
# /newsevents/speech/{lastname}{YYYYMMDD}{letter}.htm   e.g. powell20240930a.htm
SPEECH_URL_RE = re.compile(r"/newsevents/speech/[a-z]+\d{8}[a-z]\.htm$")

items = []
# Only anchors whose href matches the speech pattern are visited.
for a in soup.find_all("a", href=SPEECH_URL_RE):
    title = a.get_text(strip=True)
    if not title:
        continue   # anchor without visible text: nothing to use as a title
    href = a.get("href", "")
    items.append({
        "title": title,
        # Site-relative hrefs (leading "/") get the host prepended; others are kept as-is.
        "url":   "https://www.federalreserve.gov" + href if href.startswith("/") else href,
    })

# Keep the first 10 rows only; the bare name below is presumably there so the
# notebook renders the frame as the cell output.
df_warmup = pd.DataFrame(items).head(10)
df_warmup

# Always check robots.txt before scraping
robots = requests.get("https://www.federalreserve.gov/robots.txt", headers=HEADERS, timeout=5)
print(robots.text[:500])   # first 500 characters are enough for a quick look

<html>
  <head>
    <title>Page title</title>
  </head>
  <body>
    <h1 class="headline">Main heading</h1>
    <div id="content">
      <p>First paragraph.</p>
      <p class="abstract">Second paragraph.</p>
      <a href="https://example.com">A link</a>
    </div>
  </body>
</html>

# A minimal GET request
url = "https://www.federalreserve.gov/newsevents/speech/2024-speeches.htm"

# Always pass timeout=: requests has no default timeout, so a stalled server
# would otherwise block this cell indefinitely.
response = requests.get(url, timeout=10)

print(f"Status code : {response.status_code}")
print(f"Content-Type: {response.headers.get('Content-Type', 'n/a')}")
print(f"Body length : {len(response.text):,} characters")
print()
# Show the first 500 characters of raw HTML
print(response.text[:500])

# Always check the status code before proceeding
def safe_get(url, pause=1.0, headers=None, timeout=10):
    """
    Perform a GET request with a polite pause.

    Args:
        url: Address to fetch.
        pause: Seconds to sleep before the request; values <= 0 skip the sleep.
        headers: Optional dict of HTTP headers (e.g. a custom User-Agent).
            None sends the library defaults, as before.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response if the request succeeded with a non-error status,
        None otherwise (the failure is printed, never raised).
    """
    if pause > 0:
        time.sleep(pause)   # be polite — do not hammer the server
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()   # raises HTTPError for 4xx/5xx
    except requests.HTTPError as e:
        # Must come before RequestException, of which HTTPError is a subclass.
        print(f"HTTP error {e.response.status_code} for {url}")
        return None
    except requests.RequestException as e:
        # Connection errors, timeouts, invalid URLs, ...
        print(f"Request failed for {url}: {e}")
        return None
    return r

# safe_get returns None on failure, so guard before touching the response.
r = safe_get(url)
print(r.status_code if r else "Failed")

# Request the plain-http address on purpose so that any redirects show up in
# r.history. timeout= is set because requests never times out by default.
r = requests.get(
    "http://www.federalreserve.gov/newsevents/speech/2024-speeches.htm",
    timeout=10,
)
print(f"Final URL  : {r.url}")              # the URL you actually ended up on
print(f"Status     : {r.status_code}")       # 200 (the *final* response)
print(f"Hops taken : {len(r.history)}")      # number of redirects followed
for hop in r.history:
    # Each hop is an intermediate response; Location is where it pointed next.
    print(f"  {hop.status_code} {hop.url}  →  {hop.headers.get('Location')}")

# Headers — simulate a browser to avoid 403 errors on some sites
# NOTE: this rebinds the module-level HEADERS, so every later cell that passes
# headers=HEADERS sends this browser-style User-Agent.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

r = requests.get(url, headers=HEADERS, timeout=10)
print(r.status_code)

# Query parameters via params= (cleaner than f-strings inside URLs)
# The request is only prepared, never sent: we just want to see the final URL.
demo_params = {"q": "monetary policy", "fromDate": "2024-01-01", "toDate": "2024-12-31"}
prepared = requests.Request("GET", "https://example.com/search", params=demo_params).prepare()
print("Prepared URL:", prepared.url)

# Inspect the response object more thoroughly
r = requests.get(url, headers=HEADERS, timeout=10)
print(f"Final URL (after redirects): {r.url}")
print(f"Status                     : {r.status_code} {r.reason}")
print(f"Encoding                   : {r.encoding}")
print(f"Cookies set by server      : {dict(r.cookies)}")
print(f"Redirect history           : {[h.status_code for h in r.history]}")
print(f"Response time (s)          : {r.elapsed.total_seconds():.3f}")

# Parse the HTML response
soup = BeautifulSoup(r.text, "lxml")

# The soup object is a tree you can navigate
print(type(soup))
print(soup.title.text)   # page title (assumes the page has a <title> element)

# Working with a simple HTML string first — so results are predictable
# The snippet is a <dl> list: each <dt class="date"> is followed by a <dd>
# holding one link.
html_example = """
<html>
<body>
  <div class="press-list">
    <dl>
      <dt class="date">7 November 2024</dt>
      <dd>
        <a href="/press/pr/date/2024/html/ecb.pr241107~abc.en.html">
          Monetary policy decisions
        </a>
      </dd>
      <dt class="date">17 October 2024</dt>
      <dd>
        <a href="/press/pr/date/2024/html/ecb.pr241017~xyz.en.html">
          Monetary policy decisions
        </a>
      </dd>
      <dt class="date">12 September 2024</dt>
      <dd>
        <a href="/press/pr/date/2024/html/ecb.pr240912~def.en.html">
          Monetary policy decisions
        </a>
      </dd>
    </dl>
  </div>
</body>
</html>
"""

example_soup = BeautifulSoup(html_example, "lxml")

# find_all — get all date elements
dates = example_soup.find_all("dt", class_="date")
for d in dates:
    print(d.text.strip())

# Extract titles and URLs from the links
links = example_soup.find_all("a")
for link in links:
    title = link.text.strip()   # strip the newlines/indentation around the link text
    href  = link.get("href")
    print(f"{title!r:35}  →  {href}")

# CSS selectors — more precise targeting
# "div.press-list a" means: <a> elements inside a <div class="press-list">
for link in example_soup.select("div.press-list a"):
    print(link.text.strip(), "|", link.get("href"))

# Pairing dates with links — navigate sibling elements
records = []
dts = example_soup.find_all("dt", class_="date")
for dt in dts:
    date_str = dt.text.strip()
    # The <dd> that follows this <dt> holds the matching link.
    dd = dt.find_next_sibling("dd")
    link = dd.find("a")
    records.append({
        "date":  date_str,
        "title": link.text.strip(),
        # hrefs in the snippet are site-relative, so prepend the host
        "url":   "https://www.ecb.europa.eu" + link.get("href"),
    })

pd.DataFrame(records)

# Build a tiny tree we can navigate
nav_html = '''
<article>
  <header>
    <h2>Press release</h2>
    <time>7 November 2024</time>
  </header>
  <section class="body">
    <p class="lede">First paragraph.</p>
    <p>Second paragraph.</p>
    <p>Third paragraph.</p>
  </section>
</article>
'''
nav_soup = BeautifulSoup(nav_html, "lxml")

# Start from the <p class="lede"> and look up, then sideways.
lede = nav_soup.find("p", class_="lede")
print("Element        :", lede.name, "→", lede.text.strip())
print("Parent         :", lede.parent.name)
# `if a.name` drops ancestors that have no tag name
print("All ancestors  :", [a.name for a in lede.parents if a.name])
print("Next sibling   :", lede.find_next_sibling().text.strip())
print("All siblings   :", [s.text.strip() for s in lede.find_next_siblings()])

# Then look down from the enclosing <section>.
section = nav_soup.find("section")
# `if c.name` / `if d.name` keep tags only, skipping nodes without a tag name
print("Direct children:", [c.name for c in section.children if c.name])
print("All descendants:", [d.name for d in section.descendants if d.name])

# Step 1 — fetch one year and inspect what we got
ROOT          = "https://www.federalreserve.gov"   # prefix for site-relative hrefs
YEAR_URL      = "https://www.federalreserve.gov/newsevents/speech/{year}-speeches.htm"   # fill with .format(year=...)
SPEECH_URL_RE = re.compile(r"/newsevents/speech/[a-z]+\d{8}[a-z]\.htm$")
DATE_RE       = re.compile(r"(\d{1,2})/(\d{1,2})/(\d{4})")   # MM/DD/YYYY

resp = requests.get(YEAR_URL.format(year=2024), headers=HEADERS, timeout=10)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")

speech_links = soup.find_all("a", href=SPEECH_URL_RE)
print(f"Speech links found in 2024 archive: {len(speech_links)}\n")

# Inspect the first one and its surroundings
# NOTE: indexing [0] raises IndexError if the page yielded no matching links.
a = speech_links[0]
print("First link:")
print(f"  Title: {a.get_text(strip=True)}")
print(f"  URL  : {a.get('href')}")
# The date is in the page text RIGHT BEFORE each title link.
# Walk backwards to find the nearest MM/DD/YYYY string.
def find_preceding_date(link, max_chars=300):
    """Return the closest MM/DD/YYYY found *before* this <a> in the document.

    Args:
        link: The element to start from; its ``previous_elements`` are walked
            backwards through the document.
        max_chars: Stop walking once more than this many characters of text
            have been collected.

    Returns:
        A ``(month, day, year)`` tuple of strings for the match nearest to the
        link, or None when no date is found within the window.
    """
    chunks = []   # text nodes, nearest to the link first
    n_chars = 0
    for prev in link.previous_elements:
        # Collect text nodes only. Tags are skipped on purpose: the walk
        # already yields every text node on its own, and a tag's get_text()
        # would add that text a second time — for an ancestor of the link it
        # would even include text that comes *after* the link.
        if not isinstance(prev, str):
            continue
        chunks.append(prev)
        n_chars += len(prev) + 1   # +1 for the joining space
        if n_chars > max_chars:
            break
    # Restore document order so the last regex match is the nearest one.
    text_before = " ".join(reversed(chunks))
    matches = DATE_RE.findall(text_before)
    return matches[-1] if matches else None   # the LAST match is the closest to the link

# Try the helper on the first speech link (`a`, bound in the cell above); the
# value printed is whatever find_preceding_date returns for it.
print(f"  Date (closest preceding MM/DD/YYYY): {find_preceding_date(a)}")

def parse_fed_year(soup):
    """Turn one Fed year-archive page into a list of speech records.

    Every <a> whose href matches SPEECH_URL_RE and that has visible text
    produces a dict with the keys "date", "title", "url", "speaker", "event".
    """
    rows = []
    for anchor in soup.find_all("a", href=SPEECH_URL_RE):
        title = anchor.get_text(strip=True)
        if not title:
            continue
        href = anchor.get("href", "")

        # Date: nearest MM/DD/YYYY before the link, re-joined with slashes.
        date_parts = find_preceding_date(anchor)
        date_str = "/".join(date_parts) if date_parts else ""

        # Speaker + event: gather the text nodes after the link until the
        # next speech link, or until more than 500 characters are collected.
        pieces = []
        n_chars = 0
        for node in anchor.next_elements:
            is_other_anchor = getattr(node, "name", None) == "a" and node is not anchor
            if is_other_anchor and SPEECH_URL_RE.search(node.get("href", "")):
                break
            if isinstance(node, str):
                pieces.append(node)
                n_chars += len(node)
            if n_chars > 500:
                break

        # Collapse whitespace, then peel off everything that is not
        # speaker/event text.
        block = re.sub(r"\s+", " ", " ".join(pieces)).strip()
        # The link's own text is part of the walk; keep what follows it.
        _, hit, remainder = block.partition(title)
        if hit:
            block = remainder.strip()
        # Drop the next item's date and anything after it.
        block = re.sub(r"\d{1,2}/\d{1,2}/\d{4}.*$", "", block).strip()
        # Drop "Watch Live" / "Video" boilerplate.
        block = re.sub(r"\b(Watch Live|Video)\b\s*", "", block).strip()

        # Speaker is the text before the first " At "; the event keeps its "At".
        speaker, sep, rest = block.partition(" At ")
        event = "At " + rest if sep else ""

        full_url = ROOT + href if href.startswith("/") else href
        rows.append({
            "date":    date_str,
            "title":   title,
            "url":     full_url,
            "speaker": speaker.strip()[:200],
            "event":   event.strip()[:300],
        })
    return rows

# Test on the page we already loaded
# (`soup` is the 2024 archive parsed in the "Step 1" cell)
records = parse_fed_year(soup)
print(f"Parsed {len(records)} records from 2024\n")
pd.DataFrame(records).head()   # preview the first rows

def scrape_year(year, pause=1.0):
    """Fetch and parse the Fed speech archive for one year.

    Sleeps ``pause`` seconds first, then downloads the archive page for
    ``year``. Returns the parsed records, each tagged with a "year" key, or
    an empty list (after printing the error) when the request fails.
    """
    time.sleep(pause)
    page_url = YEAR_URL.format(year=year)
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"  {year}: error → {err}")
        return []
    rows = parse_fed_year(BeautifulSoup(response.text, "lxml"))
    for row in rows:
        row.update(year=year)
    return rows

# Scrape five archive years (2020 through 2024) into one flat list of records.
all_records = []
for yr in range(2020, 2025):
    recs = scrape_year(yr)
    all_records.extend(recs)
    print(f"  {yr}: {len(recs)} speeches")

index_df = pd.DataFrame(all_records)
# NOTE: index_df['year'] raises KeyError if every year failed and the frame is empty.
print(f"\nTotal: {len(index_df)} speeches across {len(set(index_df['year']))} years")
index_df.head()

# errors="coerce" turns anything that is not MM/DD/YYYY (including "") into NaT.
index_df["date_parsed"] = pd.to_datetime(
    index_df["date"], format="%m/%d/%Y", errors="coerce"
)

# Drop rows where date couldn't be parsed (a few stray sidebar/breadcrumb links
# may have matched our URL regex without a preceding date — those are noise).
n_before = len(index_df)
index_df = index_df.dropna(subset=["date_parsed"]).reset_index(drop=True)
n_after  = len(index_df)
print(f"Dropped {n_before - n_after} rows with unparseable dates ({n_after} remaining).\n")

# Oldest first, so .tail() / .iloc[-1] give the most recent speeches.
index_df = index_df.sort_values("date_parsed").reset_index(drop=True)
print(index_df.dtypes)
print()
print(index_df[["date_parsed","title","speaker"]].tail(5))

import matplotlib.pyplot as plt

# Bar chart of the number of indexed speeches per archive year.
counts = index_df.groupby("year")["title"].count()
fig, ax = plt.subplots(figsize=(8, 4))
counts.plot(kind="bar", ax=ax, color="steelblue", edgecolor="white")
ax.set_title("Federal Reserve Board speeches per year (2020–2024)")
ax.set_xlabel("Year"); ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

def scrape_speech_text(url, pause=1.0):
    """Fetch a single Fed speech page and return its full text.

    Args:
        url: Address of the speech page.
        pause: Seconds to sleep before the request.

    Returns:
        The page's paragraphs joined by single spaces, or "" when the request
        fails (the error is printed) or no suitable paragraph is found.
    """
    time.sleep(pause)
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        r.raise_for_status()
    except requests.RequestException as e:
        # Report the failure: an unexplained empty body is indistinguishable
        # from a page that genuinely has no text.
        print(f"  Request failed for {url}: {e}")
        return ""
    s = BeautifulSoup(r.text, "lxml")
    # Try the most specific container first, fall back gracefully
    article = (s.find("div", id="article") or
               s.find("article") or
               s.find("main") or
               s.find("body"))
    paragraphs = article.find_all("p") if article else []
    # Paragraphs of 20 characters or fewer are skipped as presumably non-body text.
    return " ".join(p.get_text(" ", strip=True) for p in paragraphs
                    if len(p.get_text(strip=True)) > 20)

# Test on the most recent speech in our index
# (index_df is sorted by date_parsed ascending, so the last row is the newest)
test_url  = index_df.iloc[-1]["url"]
test_body = scrape_speech_text(test_url)
print(f"URL    : {test_url}")
print(f"Length : {len(test_body):,} characters")
print(f"Preview: {test_body[:400]}...")

# Sample: the 10 most recent speeches, re-indexed 0..9 so `i` counts rows.
sample = index_df.tail(10).reset_index(drop=True).copy()
corpus = []

for i, row in sample.iterrows():
    print(f"  [{i+1:2d}/{len(sample)}] {row['date_parsed'].date()} — {row['speaker'][:40]}")
    body = scrape_speech_text(row["url"])
    # One corpus record = the index row plus the scraped text and its length.
    rec = row.to_dict()
    rec["body"]    = body
    rec["n_chars"] = len(body)
    corpus.append(rec)

corpus_df = pd.DataFrame(corpus)
print(f"\nCorpus: {len(corpus_df)} documents, avg {corpus_df['n_chars'].mean():.0f} chars each")
corpus_df[["date_parsed","title","speaker","n_chars"]]

# index=False: do not write the row index as an extra CSV column.
corpus_df.to_csv("fed_speeches_corpus.csv", index=False)
print(f"Saved: fed_speeches_corpus.csv ({len(corpus_df)} rows)")

# YOUR CODE HERE — five minutes!
# Hint: build a regex similar to SPEECH_URL_RE but matching:
#   /newsevents/pressreleases/{type}{YYYYMMDD}{letter}.htm
# where {type} is one or more lowercase letters and {letter} is exactly one letter.

URL = "https://www.federalreserve.gov/newsevents/pressreleases/2024-press.htm"

# Exercise scaffold (left commented out on purpose; fill it in):
# resp = requests.get(URL, headers=HEADERS, timeout=10)
# soup = BeautifulSoup(resp.text, "lxml")
# PRESS_URL_RE = re.compile(r"...")
# items = []
# for a in soup.find_all("a", href=PRESS_URL_RE):
#     ...
# pd.DataFrame(items[:5])

# Worked solution to the challenge above.
URL = "https://www.federalreserve.gov/newsevents/pressreleases/2024-press.htm"

resp = requests.get(URL, headers=HEADERS, timeout=10)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")

# Press release URLs: /newsevents/pressreleases/{type}{YYYYMMDD}{letter}.htm
# Same shape as speeches but with a {type} prefix (monetary, enforcement, bcreg, ...)
PRESS_URL_RE = re.compile(r"/newsevents/pressreleases/[a-z]+\d{8}[a-z]\.htm$")

items = []
for a in soup.find_all("a", href=PRESS_URL_RE):
    title = a.get_text(strip=True)
    if not title:
        continue   # skip anchors with no visible text
    href = a.get("href", "")
    items.append({
        "title": title,
        # Site-relative hrefs (leading "/") get the host prepended; others are kept as-is.
        "url":   "https://www.federalreserve.gov" + href if href.startswith("/") else href,
    })

# The Fed lists releases in reverse-chronological order on this page,
# so the 5 most recent are simply the first 5 items.
df_press = pd.DataFrame(items).head(5)
print(f"Total 2024 press releases found: {len(items)}")
df_press

# Always check robots.txt before scraping
robots = requests.get("https://www.federalreserve.gov/robots.txt", headers=HEADERS, timeout=5)
print(robots.text[:500])

def clean_text(text):
    """
    Basic text cleaning pipeline.
    - Remove HTML tags
    - Decode HTML entities (named and numeric)
    - Normalise whitespace
    - Strip leading/trailing whitespace

    Returns the cleaned string, or "" when `text` is not a string
    (e.g. None or NaN from a DataFrame column).
    """
    import html   # stdlib; local import keeps this cell self-contained

    if not isinstance(text, str):
        return ""
    # Remove HTML tags (safety net — BeautifulSoup should have handled these)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode every entity, not just a hand-picked few. Done after tag removal
    # so that an escaped "&lt;" stays a literal "<" instead of being read as markup.
    text = html.unescape(text)
    # Normalise whitespace: collapse runs of spaces/newlines (and the
    # non-breaking spaces produced by &nbsp;) into one space
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Apply to corpus
# Adds a cleaned copy of each body in a new column; the raw "body" is kept.
corpus_df["body_clean"] = corpus_df["body"].apply(clean_text)

# Quick check
# Print the date and the first 200 cleaned characters of the first 3 documents.
for _, row in corpus_df.head(3).iterrows():
    print(f"--- {row['date_parsed'].date()} ---")
    print(row["body_clean"][:200])
    print()

# Quick encoding-detection helper — useful when accents look broken
def detect_and_fix_encoding(response):
    """Return ``response.text``, switching to the detected encoding on a mismatch.

    Compares the encoding the response declares (``response.encoding``) with
    the one guessed from its body (``response.apparent_encoding``). If they
    differ, or if none was declared, the response is switched to the guessed
    encoding before its text is read.

    Args:
        response: A requests response. Its ``encoding`` attribute may be
            modified in place.

    Returns:
        The decoded body text.
    """
    declared = response.encoding
    apparent = response.apparent_encoding   # guessed from the body bytes
    # Nothing could be guessed: keep whatever the server declared.
    if not apparent:
        return response.text
    # `declared` is None when the server sent no usable charset; treat that as
    # a mismatch instead of crashing on None.lower().
    if declared is None or declared.lower() != apparent.lower():
        print(f"  Encoding mismatch: declared={declared}, apparent={apparent}")
        response.encoding = apparent
    return response.text

# Test the page-source heuristic
r = requests.get(url, headers=HEADERS, timeout=10)
text = detect_and_fix_encoding(r)
# Rough rule of thumb: more than 5 bare "<p>" tags or more than 20 "<div"
# openings in the raw source counts as usable static HTML. Case-insensitive.
has_meaningful_html = text.lower().count("<p>") > 5 or text.lower().count("<div") > 20
print(f"Static HTML appears usable: {has_meaningful_html}")

# YOUR CODE HERE
# Suggested structure:

# Step 1: define scraper function(s)

# Step 2: run scraper across target pages

# Step 3: build DataFrame, clean, save to CSV

# Step 4: descriptive analysis (document counts, lengths, top entities)

# ── SOLUTION — Option A (Fed speeches, full corpus 2015–2024) ───────────────────
#
# This cell is INTERRUPT-FRIENDLY and RESUMABLE:
#   • Progress is checkpointed to `fed_corpus_partial.csv` every 50 documents.
#   • Press the stop button (■) any time → progress is saved, no work lost.
#   • Re-run the cell → it skips already-scraped URLs and continues from there.
#   • Final run produces `fed_corpus.csv` with all collected documents.
#
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import time, re, os

# Site root, prepended to site-relative hrefs.
ROOT          = "https://www.federalreserve.gov"
# Year-archive template; fill with .format(year=...).
YEAR_URL      = "https://www.federalreserve.gov/newsevents/speech/{year}-speeches.htm"
# Speech pages: /newsevents/speech/{letters}{8 digits}{one letter}.htm
SPEECH_URL_RE = re.compile(r"/newsevents/speech/[a-z]+\d{8}[a-z]\.htm$")
# MM/DD/YYYY, captured as three groups.
DATE_RE       = re.compile(r"(\d{1,2})/(\d{1,2})/(\d{4})")
# TODO: replace the placeholder contact address with your own before running.
HEADERS       = {"User-Agent": "Mozilla/5.0 (academic research, contact: yourname@unibo.it)"}

# Checkpoint file written during the scrape, and the final output file.
PARTIAL_CSV = "fed_corpus_partial.csv"
FINAL_CSV   = "fed_corpus.csv"

def safe_get(url, pause=1.0):
    """GET ``url`` after sleeping ``pause`` seconds.

    Sends the module-level HEADERS with a 15-second timeout. Returns the
    response on success; on any requests error, prints it and returns None.
    """
    time.sleep(pause)
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"  Error: {err}")
        return None
    return response

def find_preceding_date(link, max_chars=300):
    """Return the closest date before ``link`` as an "M/D/YYYY" string.

    Walks backwards through the document from ``link`` and collects text
    until more than ``max_chars`` characters have been gathered. Returns the
    date match nearest to the link, or "" when none is found.
    """
    chunks = []   # text nodes, nearest to the link first
    n_chars = 0
    for prev in link.previous_elements:
        # Collect text nodes only. Tags are skipped on purpose: the walk
        # already yields every text node on its own, and a tag's get_text()
        # would add that text a second time — for an ancestor of the link it
        # would even include text that comes *after* the link.
        if not isinstance(prev, str):
            continue
        chunks.append(prev)
        n_chars += len(prev) + 1   # +1 for the joining space
        if n_chars > max_chars:
            break
    # Restore document order so the last regex match is the nearest one.
    m = DATE_RE.findall(" ".join(reversed(chunks)))
    return "/".join(m[-1]) if m else ""

def _text_after_link(link, limit=500):
    """Join the text nodes after ``link``, stopping at the next speech link
    or once more than ``limit`` characters have been collected."""
    chunks = []
    for node in link.next_elements:
        if (hasattr(node, "name") and node.name == "a" and node is not link
                and SPEECH_URL_RE.search(node.get("href", ""))):
            break
        if isinstance(node, str):
            chunks.append(node)
        if sum(map(len, chunks)) > limit:
            break
    return " ".join(chunks)


def parse_fed_year(soup):
    """Extract one record per speech link from a Fed year-archive page.

    Each record is a dict with the keys "date", "title", "url", "speaker"
    and "event". Links without visible text are skipped.
    """
    records = []
    for anchor in soup.find_all("a", href=SPEECH_URL_RE):
        title = anchor.get_text(strip=True)
        if not title:
            continue
        href = anchor.get("href", "")
        date_str = find_preceding_date(anchor)

        # Text after the link, whitespace-collapsed.
        block = re.sub(r"\s+", " ", _text_after_link(anchor)).strip()
        # Keep only what follows the link's own title text.
        if title in block:
            block = block.split(title, 1)[-1].strip()
        # Cut at the next item's date, then drop "Watch Live" / "Video".
        block = re.sub(r"\d{1,2}/\d{1,2}/\d{4}.*$", "", block).strip()
        block = re.sub(r"\b(Watch Live|Video)\b\s*", "", block).strip()

        # Speaker is the text before the first " At "; the event keeps its "At".
        speaker, _, tail = block.partition(" At ")
        event = f"At {tail}" if tail else ""

        records.append({
            "date":  date_str,
            "title": title,
            "url":   ROOT + href if href.startswith("/") else href,
            "speaker": speaker.strip()[:200],
            "event":   event.strip()[:300],
        })
    return records

def scrape_speech_text(url):
    """Download one speech page and return its paragraph text.

    Returns "" when the request fails or no container element is found.
    Paragraphs of 20 characters or fewer are left out.
    """
    response = safe_get(url)
    if not response:
        return ""
    page = BeautifulSoup(response.text, "lxml")

    # Most specific container first, then progressively broader fallbacks.
    container = page.find("div", id="article")
    for tag_name in ("article", "main", "body"):
        if container:
            break
        container = page.find(tag_name)
    if not container:
        return ""

    kept = [
        para.get_text(" ", strip=True)
        for para in container.find_all("p")
        if len(para.get_text(strip=True)) > 20
    ]
    return " ".join(kept)

# ── Step 1: build the index ─────────────────────────────────────────────────
# One request per archive year, 2015 through 2024. A year whose page cannot be
# fetched is skipped (safe_get has already printed the error).
all_records = []
for yr in range(2015, 2025):
    r = safe_get(YEAR_URL.format(year=yr))
    if not r: continue
    recs = parse_fed_year(BeautifulSoup(r.text, "lxml"))
    for rec in recs: rec["year"] = yr
    all_records.extend(recs)
    print(f"  {yr}: {len(recs)} speeches")

index_df = pd.DataFrame(all_records)
# errors="coerce" maps anything that is not MM/DD/YYYY (including "") to NaT ...
index_df["date_parsed"] = pd.to_datetime(index_df["date"], format="%m/%d/%Y", errors="coerce")
# ... and those rows are dropped; the rest is sorted oldest-first.
index_df = index_df.dropna(subset=["date_parsed"]).sort_values("date_parsed").reset_index(drop=True)
print(f"\nIndex: {len(index_df)} speeches\n")

# ── Step 2: scrape full text — RESUMABLE ────────────────────────────────────
# Load any prior partial work
done_urls = set()
prior_records = []
if os.path.exists(PARTIAL_CSV):
    # keep_default_na=False keeps empty speaker/event cells as "" (not NaN) and
    # parse_dates restores Timestamps, so reloaded rows have the same types as
    # freshly scraped ones.
    prior = pd.read_csv(PARTIAL_CSV, keep_default_na=False, parse_dates=["date_parsed"])
    # An empty body means the fetch failed (or yielded no text): drop those
    # rows so their URLs are retried instead of being counted as done.
    prior = prior[prior["n_chars"] > 0]
    prior_records = prior.to_dict("records")
    done_urls = set(prior["url"])
    remaining = sum(u not in done_urls for u in index_df["url"])
    print(f"Resuming: {len(done_urls)} documents already scraped, "
          f"{remaining} remaining.\n")

corpus     = list(prior_records)
new_count  = 0
checkpoint = 50   # write PARTIAL_CSV after every 50 newly scraped documents

try:
    for _, row in index_df.iterrows():
        if row["url"] in done_urls:
            continue
        body = scrape_speech_text(row["url"])
        # One corpus record = the index row plus the scraped text and its length.
        rec  = {**row.to_dict(), "body": body, "n_chars": len(body)}
        corpus.append(rec)
        new_count += 1

        # Checkpoint every `checkpoint` new documents
        if new_count % checkpoint == 0:
            pd.DataFrame(corpus).to_csv(PARTIAL_CSV, index=False)
            print(f"  Checkpoint: {len(corpus)}/{len(index_df)} documents saved to {PARTIAL_CSV}")

except KeyboardInterrupt:
    # Stop button pressed: persist everything collected so far, then re-raise.
    print(f"\n  ⏸  Interrupted at {len(corpus)}/{len(index_df)} documents.")
    pd.DataFrame(corpus).to_csv(PARTIAL_CSV, index=False)
    print(f"  Progress saved to {PARTIAL_CSV}. Re-run the cell to resume.")
    raise   # so you see the traceback and know it stopped

# ── Step 3: finalise ────────────────────────────────────────────────────────
corpus_df = pd.DataFrame(corpus)
corpus_df.to_csv(FINAL_CSV, index=False)
print(f"\n✓ Done: {len(corpus_df)} documents saved to {FINAL_CSV}")

# ── Step 4: descriptive analysis ────────────────────────────────────────────
# Document counts per archive year.
print("\nSpeeches per year:")
print(corpus_df.groupby("year")["title"].count())

# Body length in characters (n_chars is len(body), set during the scrape).
print(f"\nAvg length    : {corpus_df['n_chars'].mean():.0f} chars")
print(f"Median length : {corpus_df['n_chars'].median():.0f} chars")

# value_counts() sorts by frequency, so head() gives the five most frequent speakers.
print("\nTop 5 speakers (by speech count):")
print(corpus_df["speaker"].value_counts().head())

# Plot: speeches per year
fig, ax = plt.subplots(figsize=(10, 4))
corpus_df.groupby("year")["title"].count().plot(
    kind="bar", color="steelblue", edgecolor="white", ax=ax)
ax.set_title("Federal Reserve speeches per year, 2015–2024")
ax.set_xlabel("Year"); ax.set_ylabel("Count")
fig.tight_layout()
# Save the figure to disk before showing it.
fig.savefig("fed_speeches_by_year.png", dpi=300, bbox_inches="tight")
plt.show()

Code	Meaning
200	OK — request succeeded
301/302	Redirect
403	Forbidden — server refuses the request
404	Not found
429	Too many requests — you are being rate-limited
500	Server error

Code	Name	Meaning
`200`	OK	The request succeeded; the body contains what you asked for.
`204`	No Content	Request succeeded but the response body is empty (common for `DELETE` or some APIs).

Code	Name	Meaning
`301`	Moved Permanently	The new URL is the canonical one. Search engines and crawlers should update their records. Use this to update your scraper code to the new URL.
`302`	Found (Temporary)	The redirect is temporary — keep using the original URL in future requests.
`307`	Temporary Redirect	Like `302`, but the HTTP method is preserved (a `POST` stays a `POST` — relevant for APIs).
`308`	Permanent Redirect	Like `301`, with method preserved.

Code	Name	Most common cause when scraping	Fix
`400`	Bad Request	Malformed URL, invalid query parameters	Check the URL and `params=` dict
`401`	Unauthorized	Endpoint requires authentication	Out of scope for public scraping
`403`	Forbidden	Server detected and blocked your `python-requests/x.y` User-Agent, or geo-block, or a more aggressive bot detector (Cloudflare)	Set a browser-style User-Agent; if that fails, the site is actively defending — consider Selenium (L6) or back off
`404`	Not Found	The page no longer exists or the URL pattern changed	Verify the URL in a browser; the site may have restructured
`405`	Method Not Allowed	You used `GET` on an endpoint expecting `POST`	Check the API docs
`429`	Too Many Requests	Rate limiting: you're sending requests faster than the server allows. Often combined with a `Retry-After` header telling you how long to wait.	Slow down — increase `time.sleep()`; consider exponential backoff; in extreme cases use `requests.Session()` with retries (L6)

Code	Name	What you do
`500`	Internal Server Error	The server crashed on your request. Retry once or twice with a delay; if persistent, the URL itself may be problematic.
`502` / `503` / `504`	Bad Gateway / Service Unavailable / Gateway Timeout	Transient infrastructure issues. Retry with exponential backoff (1s → 2s → 4s → 8s).

Method	Returns	Use when
`soup.find(tag, attrs)`	First matching element	You need exactly one element
`soup.find_all(tag, attrs)`	List of all matching elements	You need all elements of a type
`element.text`	Plain text content	Extracting readable text
`element.get("attr")`	Attribute value	Extracting URLs, IDs, classes
`element.select("css_selector")`	List by CSS selector	Precise targeting of nested elements

Symptom	Likely cause	Fix
`soup.find(...)` returns `None` but element is visible in browser	Page renders content via JavaScript after load	L6: use Selenium, or look for the underlying JSON API
`r.status_code == 403`	Server blocks default `python-requests/x.y` user agent	Set a realistic `User-Agent` header (cell 6)
`r.status_code == 429`	Rate-limited — too many requests too fast	Increase `time.sleep()`; use `requests.Session()` (L6)
Garbled accents (`Ã©` instead of `é`)	Encoding mismatch	Set `r.encoding = "utf-8"` before calling `r.text`, or use `r.content` and decode manually
`r.text` ends mid-page	Server closes the connection	Add `timeout=` and retries
Elements present but `.text` is empty	Content is in `data-*` attributes or nested `<noscript>`	Use `el.get("data-content")`, or move to Selenium

Step	Tool	Key concept
Make a request	`requests.get(url)`	Check `.status_code` before using `.text`
Parse HTML	`BeautifulSoup(html, "lxml")`	`find()`, `find_all()`, `select()`
Extract text	`.text.strip()`	Always strip whitespace
Extract attributes	`.get("href")`	Links, IDs, classes
Navigate siblings	`.find_next_sibling()`	Pair dates with content
Be polite	`time.sleep(1.5)`	Avoid 429 errors and server load
Handle errors	`raise_for_status()` + `try/except`	Never assume 200

Lecture 5 — Web Scraping I: requests & BeautifulSoup¶

Python for Economists · University of Bologna · 2025/2026¶

What we cover today¶

0. Warm-up — scraping live, before any theory¶

The ethics moment (5 minutes, non-negotiable)¶

1. How the web works¶

1.1 The HTTP request-response cycle¶

1.2 Status codes¶

1.3 HTML structure¶

2. Making requests with requests¶

📋 HTTP status codes — a scraper's field guide¶

✅ Success (2xx)¶

🔀 Redirects (3xx)¶

❌ Client errors (4xx) — the request is the problem¶

🔥 Server errors (5xx) — the server is the problem¶

🛠 The defensive scraping pattern¶

2.1 Query parameters and the response object¶

3. Parsing HTML with BeautifulSoup¶

3.1 Core methods¶

3.2 Tree navigation: parents, siblings, descendants¶

Interlude — notebook vs. script: when to switch¶

4. Building a scraper: loops, pagination, delays¶

4.1 Inspect first — never scrape blind¶

4.2 A reusable parser¶

4.3 Scale to multiple years¶

4.4 Parse dates and clean up¶

4.5 First descriptive: speeches per year¶

4.6 Scraping the full text of individual speeches¶

4.7 Build the text corpus (sample)¶

4.8 Save the corpus¶

⏱ Five-minute challenge — scrape your first titles¶

5. Ethics and legality of web scraping — the practical handbook¶

5.1 robots.txt — what to look for¶

5.2 Practical rules¶

6. Basic text cleaning¶

7. Diagnosing scraping problems¶

8. Exercise¶

Option A — Fed speeches (recommended if your topic is monetary policy / central bank communication)¶

Option B — Banca d'Italia governor's speeches (recommended if you prefer an Italian source)¶

Option C — your own source¶

Summary¶

Next lecture¶

2. Making requests with `requests`¶

✅ Success (`2xx`)¶

🔀 Redirects (`3xx`)¶

❌ Client errors (`4xx`) — the request is the problem¶

🔥 Server errors (`5xx`) — the server is the problem¶

5.1 `robots.txt` — what to look for¶