import json
import pandas as pd
import time
from bs4 import BeautifulSoup

# Mock API payload, page 1 of 2. "next_page" names the page that follows;
# "results" holds the records as dicts with explicit id/title/date fields.
api_page_1 = {
    "page": 1,
    "next_page": 2,
    "results": [
        {"id": 101, "title": "Inflation slows", "date": "2026-03-01"},
        {"id": 102, "title": "Exports rise", "date": "2026-03-02"}
    ]
}

# Mock API payload, page 2 of 2. "next_page" is None here, i.e. there is no
# further page after this one in the mock data.
api_page_2 = {
    "page": 2,
    "next_page": None,
    "results": [
        {"id": 103, "title": "Jobs data revised", "date": "2026-03-03"}
    ]
}

# Mock HTML archive page carrying the same two records as api_page_1. Each
# record is a <div class="entry"> whose data-id attribute, <h2> and <span>
# hold the id, title and date respectively.
html_page_1 = '''
<div class="entry" data-id="101"><h2>Inflation slows</h2><span>2026-03-01</span></div>
<div class="entry" data-id="102"><h2>Exports rise</h2><span>2026-03-02</span></div>
'''

# Mock HTML archive page carrying the same single record as api_page_2.
html_page_2 = '''
<div class="entry" data-id="103"><h2>Jobs data revised</h2><span>2026-03-03</span></div>
'''

def parse_api_pages(pages):
    """Flatten paginated API payloads into a single DataFrame.

    Args:
        pages: Iterable of page payloads; each must expose its records
            under the ``"results"`` key.

    Returns:
        A ``pandas.DataFrame`` with one row per record, in page order and
        then in the order the records appear within each page.

    Raises:
        KeyError: If a page has no ``"results"`` key.
    """
    records = [record for payload in pages for record in payload["results"]]
    return pd.DataFrame(records)

# Combine both mock API pages into one DataFrame.
df_api = parse_api_pages([api_page_1, api_page_2])
# Bare expression: presumably kept for notebook-style display of the frame;
# it has no effect when the file runs as a plain script.
df_api

def parse_html_pages(pages):
    """Scrape entry records out of HTML pages into a single DataFrame.

    Every ``<div class="entry">`` element becomes one row: ``id`` is the
    integer value of its ``data-id`` attribute, ``title`` is the stripped
    text of its first ``<h2>``, and ``date`` is the stripped text of its
    first ``<span>``.

    Args:
        pages: Iterable of HTML strings.

    Returns:
        A ``pandas.DataFrame`` with ``id``, ``title`` and ``date`` columns,
        rows in page order and then document order.
    """
    records = []
    for markup in pages:
        document = BeautifulSoup(markup, "html.parser")
        records += [
            {
                "id": int(node["data-id"]),
                "title": node.select_one("h2").get_text(strip=True),
                "date": node.select_one("span").get_text(strip=True),
            }
            for node in document.select("div.entry")
        ]
    return pd.DataFrame(records)

# Scrape both mock HTML pages into one DataFrame.
df_html = parse_html_pages([html_page_1, html_page_2])
# Bare expression: presumably kept for notebook-style display of the frame;
# it has no effect when the file runs as a plain script.
df_html

# Side-by-side summary of the two acquisition routes, written one criterion
# per row so each trade-off reads left to right.
comparison = pd.DataFrame(
    [
        ("structure", "explicit fields", "must infer structure"),
        ("fragility", "usually lower", "higher"),
        ("metadata quality", "typically higher", "depends on page design"),
        ("pagination logic", "often documented", "must reverse engineer"),
        ("legal/ethical clarity", "usually clearer", "must inspect robots/terms carefully"),
    ],
    columns=["criterion", "API", "HTML scraping"],
)
comparison

def polite_pagination_fetch(max_pages=3, pause_seconds=0.2):
    """Simulate fetching pages ``1..max_pages`` with a pause between requests.

    No network access happens; each "fetch" only records the page number.

    Args:
        max_pages: Number of pages to walk through. Values below 1 yield
            an empty list.
        pause_seconds: Delay inserted between consecutive page fetches.

    Returns:
        The list of page numbers visited, in ascending order.
    """
    pages_fetched = []
    for page in range(1, max_pages + 1):
        # Pause only *between* requests: the delay exists to space out
        # consecutive fetches, so sleeping after the final page would just
        # hold up the caller with nothing left to throttle.
        if pages_fetched:
            time.sleep(pause_seconds)
        pages_fetched.append(page)
    return pages_fetched

# Run the simulation with its defaults (max_pages=3, pause_seconds=0.2); the
# returned list is not stored, presumably it is shown as notebook cell output.
polite_pagination_fetch()

## Goal

## Core angle

## 1. Mock API response and mock HTML archive

## 2. Parse the API output

## 3. Parse the HTML output

## 4. Compare the two sources

## 5. Simulate a simple rate limit policy

## Short exercise

## Optional extension

# Extra L6 — APIs vs Scraping, Pagination, and Rate Limits

## Goal

## Core angle

## 1. Mock API response and mock HTML archive

## 2. Parse the API output

## 3. Parse the HTML output

## 4. Compare the two sources

## 5. Simulate a simple rate limit policy

## Short exercise

## Optional extension