from bs4 import BeautifulSoup
import pandas as pd

# Layout 1 of the sample page: each story is a <div class="article">
# containing an <h2 class="title"> and a <span class="date"> whose text
# is the date.
html_v1 = '''
<html>
  <body>
    <div class="article">
      <h2 class="title">Inflation slows in March</h2>
      <span class="date">2026-03-12</span>
    </div>
    <div class="article">
      <h2 class="title">Unemployment stable</h2>
      <span class="date">2026-03-13</span>
    </div>
  </body>
</html>
'''

# Layout 2 carries the same two stories with different markup: the wrapper
# is <article class="news-item">, the headline is tagged with
# data-role="headline" instead of a class, and the date sits in a <time>
# element whose datetime attribute holds the ISO form while its text is a
# human-readable form.
html_v2 = '''
<html>
  <body>
    <article class="news-item">
      <h2 data-role="headline">Inflation slows in March</h2>
      <time datetime="2026-03-12">12 March 2026</time>
    </article>
    <article class="news-item">
      <h2 data-role="headline">Unemployment stable</h2>
      <time datetime="2026-03-13">13 March 2026</time>
    </article>
  </body>
</html>
'''

def parse_brittle(html):
    """Extract title/date records using selectors tied to one page layout.

    Only ``div.article`` blocks are considered, and each one is expected to
    contain an ``h2.title`` and a ``span.date`` element.

    Args:
        html: HTML markup to parse.

    Returns:
        A DataFrame built from one ``{"title": ..., "date": ...}`` record
        per matched block. When no block matches, the records list is
        empty and the resulting DataFrame is empty; no error is raised.

    Raises:
        AttributeError: If a matched block lacks either child element,
            because ``select_one`` then returns ``None``.
    """
    document = BeautifulSoup(html, "html.parser")
    records = [
        {
            "title": article.select_one("h2.title").get_text(strip=True),
            "date": article.select_one("span.date").get_text(strip=True),
        }
        for article in document.select("div.article")
    ]
    return pd.DataFrame(records)

# Run the layout-specific parser on the layout it was written for.
brittle_v1 = parse_brittle(html_v1)
print(brittle_v1)

# Run it on the second layout; any exception raised while parsing or
# printing is reported instead of stopping the script.
try:
    brittle_v2 = parse_brittle(html_v2)
    print(brittle_v2)
except Exception as exc:
    print("Parser failed:", repr(exc))

def first_text(parent, selectors):
    """Return the first non-blank value found by trying selectors in order.

    For each selector, the first matching node under ``parent`` is
    inspected. A ``datetime`` attribute, when present and non-blank, is
    preferred over the node's text; otherwise the stripped text is used.
    A node that yields only a blank value is skipped so that later
    selectors still get a chance, and so that a blank string is never
    reported as if it were real data.

    Args:
        parent: Object exposing ``select_one(selector)``; the nodes it
            returns must support ``has_attr``, item access, and
            ``get_text(strip=True)``.
        selectors: Iterable of CSS selector strings, highest priority first.

    Returns:
        The first non-blank string found, or ``None`` if no selector
        matched a node with usable content.
    """
    for sel in selectors:
        node = parent.select_one(sel)
        if node is None:
            continue

        value = ""
        if node.has_attr("datetime"):
            raw = node["datetime"]
            # Guard against a valueless attribute before stripping.
            value = raw.strip() if isinstance(raw, str) else ""
        if not value:
            # No usable attribute: fall back to the visible text.
            value = node.get_text(strip=True)

        if value:
            return value
    return None

def parse_robust(html):
    """Extract title/date records from either known page layout.

    Blocks are matched by ``div.article`` or ``article.news-item``. Within
    each block the title and date are looked up through ``first_text``
    with an ordered list of fallback selectors.

    Args:
        html: HTML markup to parse.

    Returns:
        A DataFrame with exactly the columns ``title`` and ``date``, one
        row per matched block. The columns are present even when no block
        matched, so callers can index them without checking for emptiness.
    """
    soup = BeautifulSoup(html, "html.parser")
    rows = [
        {
            "title": first_text(
                block, ["h2.title", "h2[data-role='headline']", "h2"]
            ),
            "date": first_text(block, ["span.date", "time[datetime]", "time"]),
        }
        for block in soup.select("div.article, article.news-item")
    ]
    # Explicit columns keep the schema stable when ``rows`` is empty;
    # a DataFrame built from an empty list would otherwise have no columns.
    return pd.DataFrame(rows, columns=["title", "date"])

# Show the fallback-based parser's output for both layouts, v1 first.
for page in (html_v1, html_v2):
    print(parse_robust(page))

def parse_with_quality_report(html):
    """Parse ``html`` with ``parse_robust`` and summarise missing fields.

    Args:
        html: HTML markup to parse.

    Returns:
        A ``(df, report)`` tuple. ``df`` is the parsed DataFrame and
        ``report`` is a dict with ``n_rows`` (number of parsed rows) plus
        ``missing_title`` and ``missing_date`` (counts of null values in
        the respective column).
    """
    df = parse_robust(html)

    def count_missing(column):
        # A parse that matched nothing can yield a frame without this
        # column; report zero missing values instead of raising KeyError
        # (``n_rows == 0`` already signals the empty parse).
        if column not in df.columns:
            return 0
        return int(df[column].isna().sum())

    report = {
        "n_rows": len(df),
        "missing_title": count_missing("title"),
        "missing_date": count_missing("date"),
    }
    return df, report

# Parse each layout, print its quality report, then show the parsed table.
# NOTE(review): `display` is not defined in this file; presumably it is the
# notebook-provided helper — confirm before running as a plain script.
versions = (("v1", html_v1), ("v2", html_v2))
for label, html in versions:
    df_parsed, report = parse_with_quality_report(html)
    print(label, report)
    display(df_parsed)

Extra L5 — Web Scraping Robustness and HTML Fragility

Goal

Important note

1. Two HTML versions of the same webpage

2. A brittle parser

3. A more robust parser

4. Logging parse quality

Short exercise

Optional extension