14.2 Web Scraping and Parsing Basics

14.2.1 The Requests HTTP Library

Installation and Basic Usage

# Install the requests library (if not already installed)
# pip install requests

import requests
import time  # Used by the retry example further down

# Example: sending a GET request
def basic_get_request():
    url = "https://httpbin.org/get"
    try:
        response = requests.get(url)
        # Check the response status code (200 means success)
        if response.status_code == 200:
            # Parse the response body as JSON
            json_data = response.json()
            print("Request succeeded!")
            print("Response data:", json_data)
            return json_data
        else:
            print(f"Request failed, status code: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request raised an exception: {e}")
        return None
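
# A minimal supplementary sketch: besides response.json(), the body of a response can be read
# as decoded text via response.text or as raw bytes via response.content; response.encoding
# controls how .text decodes the bytes and can be overridden for mis-detected pages.
def inspect_response_body():
    response = requests.get("https://httpbin.org/get")
    print("Detected encoding:", response.encoding)
    print("First 80 characters of text:", response.text[:80])
    print("First 20 raw bytes:", response.content[:20])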

# Example: GET request with query parameters
def get_with_params():
    url = "https://httpbin.org/get"
    params = {
        "page": 1,
        "limit": 10,
        "category": "python"
    }
    try:
        response = requests.get(url, params=params)
        print(f"Request URL: {response.url}")  # Inspect the full URL (including the encoded parameters)
        print("Response data:", response.json())
        return response.json()
    except Exception as e:
        print(f"Error: {e}")

# Example: sending a POST request
def basic_post_request():
    url = "https://httpbin.org/post"
    data = {
        "name": "John Doe",
        "age": 30,
        "hobbies": ["reading", "coding"]
    }
    try:
        response = requests.post(url, json=data)  # The json= parameter automatically sets Content-Type to application/json
        print("POST response:", response.json())
        return response.json()
    except Exception as e:
        print(f"Error: {e}")

# Working with request headers and cookies
def request_with_headers_and_cookies():
    url = "https://httpbin.org/headers"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"
    }
    cookies = {
        "session_id": "abc123",
        "theme": "dark"
    }
    try:
        response = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            timeout=5  # Timeout in seconds
        )
        # The /headers endpoint echoes only the request headers; cookies show up inside
        # the echoed "Cookie" header rather than under a separate "cookies" key.
        print("Request headers echoed back:", response.json()["headers"])
        print("Cookie header echoed back:", response.json()["headers"].get("Cookie"))
        return response.json()
    except Exception as e:
        print(f"Error: {e}")

# Save the response body to a file
def save_response_to_file():
    url = "https://httpbin.org/image/png"
    try:
        response = requests.get(url, stream=True)  # stream=True downloads large files in chunks
        if response.status_code == 200:
            with open("image.png", "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print("Image saved as image.png")
            return True
        else:
            print(f"Request failed, status code: {response.status_code}")
            return False
    except Exception as e:
        print(f"Error: {e}")
        return False
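
# A minimal supplementary sketch: response.headers is a case-insensitive mapping of the
# response headers, so the Content-Type can be checked before the body is written to disk.
def check_content_type(url="https://httpbin.org/image/png"):
    response = requests.get(url, stream=True)
    content_type = response.headers.get("Content-Type", "")
    print("Content-Type:", content_type)
    return content_type.startswith("image/")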

# Timeouts and retries
# Note: requests.get() does not accept retries or backoff_factor arguments; retries have to be
# implemented manually (as below) or delegated to urllib3 via a Session (see the sketch after this listing).
def request_with_retries(max_retries=3, backoff_factor=1):
    url = "https://httpbin.org/delay/2"  # Endpoint that responds after a 2-second delay
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=3)  # 3-second timeout per attempt
            print(f"Request succeeded, status code: {response.status_code}")
            return response.json()
        except requests.exceptions.Timeout:
            wait = backoff_factor * (2 ** attempt)  # Back off 1, 2, 4... seconds
            print(f"Request timed out, retrying in {wait} seconds (attempt {attempt + 1}/{max_retries})")
            time.sleep(wait)
        except requests.exceptions.RequestException as e:
            print(f"Request raised an exception: {e}")
            return None
    print("All retries failed")
    return None

# Run the example functions
if __name__ == "__main__":
    basic_get_request()
    get_with_params()
    basic_post_request()
    request_with_headers_and_cookies()
    save_response_to_file()
    request_with_retries()
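
As an alternative to the manual retry loop above, retries can be delegated to urllib3 by mounting an HTTPAdapter configured with a Retry policy onto a requests.Session. The following is a minimal sketch; the retry counts and status codes are illustrative values, and make_retrying_session is a helper name introduced here rather than part of requests itself.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(total=3, backoff_factor=1):
    # Build a Session whose HTTP and HTTPS adapters retry failed requests automatically
    retry_policy = Retry(
        total=total,                            # Maximum number of retries
        backoff_factor=backoff_factor,          # Sleep an exponentially growing interval between attempts
        status_forcelist=[500, 502, 503, 504],  # Also retry on these server error status codes
    )
    adapter = HTTPAdapter(max_retries=retry_policy)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# Usage: a Session is called just like the requests module itself
# session = make_retrying_session()
# response = session.get("https://httpbin.org/status/503", timeout=3)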

14.2.2 The BeautifulSoup Parsing Library

Installation and Basic Usage

# Install BeautifulSoup and a parser (choose one appropriate for your system)
# pip install beautifulsoup4 lxml

from bs4 import BeautifulSoup
import requests

def basic_bs4_demo():
    # Sample HTML content (replace with the response body of a real page)
    html_content = """
    <html>
    <head>
        <title>BeautifulSoup示例页面</title>
    </head>
    <body>
        <div class="container">
            <h1>Python网页解析</h1>
            <p class="intro">这是一个使用BeautifulSoup解析的示例页面</p>
            <ul class="menu">
                <li><a href="/home">首页</a></li>
                <li><a href="/about">关于</a></li>
                <li><a href="/contact">联系我们</a></li>
            </ul>
            <div class="article">
                <h2>BeautifulSoup教程</h2>
                <p>BeautifulSoup是一个HTML/XML解析器...</p>
                <p>它能从网页中提取数据并处理复杂的HTML结构。</p>
            </div>
            <div class="article">
                <h2>Requests库使用</h2>
                <p>Requests库用于发送HTTP请求...</p>
                <p>它提供了简洁的API来处理各种HTTP方法。</p>
            </div>
        </div>
    </body>
    </html>
    """

    # Create a BeautifulSoup object and specify a parser (lxml is a fast parser)
    soup = BeautifulSoup(html_content, 'lxml')

    # Basic operations
    print("1. Page title tag:", soup.title)
    print("   Title text:", soup.title.string)
    print("   Tag name:", soup.title.name)

    print("\n2. All links:")
    all_links = soup.find_all('a')  # Find every <a> tag
    for link in all_links:
        print(f"   Link text: {link.string}, URL: {link.get('href')}")

    print("\n3. Articles with a specific class:")
    articles = soup.find_all('div', class_='article')  # Find div tags whose class is "article"
    for article in articles:
        title = article.find('h2').string
        content = article.find('p').string
        print(f"   Article title: {title}")
        print(f"   Article content: {content}")

    print("\n4. A specific link:")
    home_link = soup.find('a', string='首页')  # Find a link by its text content ("首页" is the home link in the sample HTML)
    if home_link:
        print(f"   Home page link: {home_link.get('href')}")

# Fetch a real page with requests and parse it
def parse_real_website():
    url = "https://example.com"  # Replace with the target site's URL
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for 4xx/5xx status codes

        soup = BeautifulSoup(response.text, 'lxml')

        # Extract the site title and description (adjust the selectors to the actual page structure)
        title = soup.title.string if soup.title else "No title"
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = description_tag['content'] if description_tag else "No description"

        print(f"Site title: {title}")
        print(f"Site description: {description}")

        # Extract all image links (adjust the selectors to the actual page structure)
        images = soup.find_all('img')
        print(f"Number of images: {len(images)}")
        for img in images:
            img_url = img.get('src')
            if img_url and img_url.startswith('/'):  # Handle root-relative paths (see the urljoin sketch below for a more general approach)
                img_url = f"{url}{img_url}"
            print(f"   Image URL: {img_url}")

        return soup
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None
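
# A minimal supplementary sketch: the startswith('/') check above only covers root-relative
# paths. urllib.parse.urljoin from the standard library resolves relative, root-relative and
# absolute URLs uniformly against the page URL.
from urllib.parse import urljoin

def resolve_image_url(page_url, src):
    # urljoin("https://example.com/blog/post", "img/a.png")  -> "https://example.com/blog/img/a.png"
    # urljoin("https://example.com/blog/post", "/img/a.png") -> "https://example.com/img/a.png"
    # urljoin("https://example.com/blog/post", "https://cdn.example.com/a.png") -> unchanged
    return urljoin(page_url, src)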

# Ways of locating elements
def locate_elements():
    html = """
    <html>
    <body>
        <div id="main-content">
            <p class="para">段落1</p>
            <p class="para">段落2</p>
            <div class="section">
                <p class="para">段落3</p>
                <p class="para">段落4</p>
            </div>
        </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html, 'lxml')

    # 1. find() returns the first matching element
    first_para = soup.find('p')
    print(f"First p tag: {first_para.string}")

    # 2. find_all() returns every matching element
    all_paras = soup.find_all('p', class_='para')  # Find p tags whose class is "para"
    print(f"Number of p tags with class 'para': {len(all_paras)}")
    for para in all_paras:
        print(f"   Content: {para.string}")

    # 3. CSS selectors (more flexible)
    section_paras = soup.select('div.section p.para')  # p.para elements nested inside div.section
    print(f"Number of paragraphs found via the CSS selector: {len(section_paras)}")
    for para in section_paras:
        print(f"   CSS-selector content: {para.string}")

    # 4. Locating elements by attribute
    main_content = soup.find('div', id='main-content')  # Find an element by its id
    print(f"Tag name of the element with id 'main-content': {main_content.name}")

# Working with nested elements and hierarchy
def handle_nested_elements():
    html = """
    <html>
    <body>
        <ul class="products">
            <li class="product">
                <h3>Python编程入门</h3>
                <p>作者:张三</p>
                <span class="price">¥59.00</span>
            </li>
            <li class="product">
                <h3>数据分析实战</h3>
                <p>作者:李四</p>
                <span class="price">¥79.00</span>
            </li>
        </ul>
    </body>
    </html>
    """
    soup = BeautifulSoup(html, 'lxml')

    # Extract every product (nested structure)
    products = soup.find_all('li', class_='product')
    for product in products:
        title = product.find('h3').string
        author = product.find('p').string
        price = product.find('span', class_='price').string
        print(f"Product: {title}")
        print(f"  Author: {author}")
        print(f"  Price: {price}")

# Run the example functions
if __name__ == "__main__":
    basic_bs4_demo()
    locate_elements()
    handle_nested_elements()
    parse_real_website()
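
Besides find/find_all and CSS selectors, BeautifulSoup also lets you walk the parse tree relative to an element you have already located. A minimal sketch follows; the tag names and classes in its sample HTML are illustrative only.

from bs4 import BeautifulSoup

def navigate_tree_demo():
    html = """
    <div class="article">
        <h2>Sample title</h2>
        <p>First paragraph</p>
        <p>Second paragraph</p>
    </div>
    """
    soup = BeautifulSoup(html, 'lxml')
    heading = soup.find('h2')

    # Move up to the enclosing element
    print("Parent tag:", heading.parent.name)
    # Move sideways to the next element at the same level
    print("Next sibling:", heading.find_next_sibling('p').get_text())
    # Iterate over the direct children of the parent element
    for child in heading.parent.find_all(recursive=False):
        print("Child tag:", child.name)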

14.2.3 Data Extraction and Cleaning

Data Extraction and Processing

```python
from bs4 import BeautifulSoup
import requests
import re

def extract_and_clean_data():
    # 1. Basic extraction and cleaning (sample product HTML)
    html = """
    <div class="product">
        <h2>Python网络爬虫实战</h2>
        <p>作者:王小明 <span class="price">¥69.00</span></p>
        <div class="rating">★★★★☆ <span class="review-count">(128条评价)</span></div>
        <p class="desc">本书详细介绍了网络爬虫的开发技术,包含多个实战案例。</p>
    </div>
    """
    soup = BeautifulSoup(html, 'lxml')

    # Extract the title (plain text)
    title = soup.find('h2').get_text(strip=True)  # strip=True removes leading/trailing whitespace
    print(f"Title: {title}")

    # Extract the price (handle the currency format)
    price_tag = soup.find('span', class_='price')
    if price_tag:
        price_str = price_tag.get_text(strip=True)
        # Use a regular expression to pull out the numeric value (drop the currency symbol and spaces)
        price = re.search(r'¥(\d+\.?\d*)', price_str)
        if price:
            price_value = float(price.group(1))
            print(f"Price: {price_value} yuan")
        else:
            print(f"Unrecognized price format: {price_str}")

    # Extract the review count (the number inside the parentheses)
    rating_div = soup.find('div', class_='rating')
    if rating_div:
        review_count_str = rating_div.find('span', class_='review-count').get_text()
        review_count = re.search(r'\((\d+)条评价\)', review_count_str)
        if review_count:
            review_num = int(review_count.group(1))
            print(f"Review count: {review_num}")
        else:
            print(f"Unrecognized review-count format: {review_count_str}")

# 2. More complex extraction (multiple nested levels)
def extract_product_details(html):
    soup = BeautifulSoup(html, 'lxml')
    product = {}

    # Basic fields
    product['title'] = soup.find('h2').get_text(strip=True)
    # The sample <p> has the form "作者:<name> <price>"; take the first token and drop the "作者:" label
    author_text = soup.find('p').get_text(strip=True)
    product['author'] = author_text.split()[0].replace('作者:', '').replace('作者:', '')
    # Numeric price extracted from a string such as "¥69.00"
    product['price'] = float(re.search(r'¥(\d+\.?\d*)', soup.find('span', class_='price').get_text()).group(1))
    # Rating: count the filled stars in a string such as "★★★★☆" (calling float() on the stars would fail)
    product['rating'] = float(soup.find('div', class_='rating').get_text(strip=True).count('★'))
    product['review_count'] = int(re.search(r'\((\d+)条评价\)', soup.find('span', class_='review-count').get_text()).group(1))
    product['description'] = soup.find('p', class_='desc').get_text(strip=True)

    return product

# Test the product-extraction function
product_html = """
<div class="product">
    <h2>Python网络爬虫实战</h2>
    <p>作者:王小明 <span class="price">¥69.00</span></p>
    <div class="rating">★★★★☆ <span class="review-count">(128条评价)</span></div>
    <p class="desc">本书详细介绍了网络爬虫的开发技术,包含多个实战案例。</p>
</div>
"""
product = extract_product_details(product_html)
print("\n产品详细信息:")
for key, value in product.items():
    print(f"  {key}: {value}")

# 3. Working with table data (converting it into structured records)
def extract_table_data():
    html = """
    <table class="data-table">
        <thead>
            <tr>
                <th>产品名称</th>
                <th>价格</th>
                <th>销量</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Python编程入门</td>
                <td>¥59.00</td>
                <td>1000+</td>
            </tr>
            <tr>
                <td>数据分析实战</td>
                <td>¥79.00</td>
                <td>850+</td>
            </tr>
        </tbody>
    </table>
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='data-table')
    headers = [th.get_text(strip=True) for th in table.find_all('th')]

    # Walk the body rows and pair each cell with its column header
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        rows.append(dict(zip(headers, cells)))

    print("Table data:")
    for row in rows:
        print(f"  {row}")
    return rows

# Run the remaining extraction examples
extract_and_clean_data()
extract_table_data()
```
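
When a table is well-formed, pandas can also parse it directly into a DataFrame and take over much of the cleaning step. The following is a minimal sketch, assuming pandas and lxml are installed; it reuses the sample table above, and the string-cleaning steps (stripping the currency symbol and the trailing "+") are illustrative.

```python
from io import StringIO

import pandas as pd

sample_html = """
<table class="data-table">
    <thead>
        <tr><th>产品名称</th><th>价格</th><th>销量</th></tr>
    </thead>
    <tbody>
        <tr><td>Python编程入门</td><td>¥59.00</td><td>1000+</td></tr>
        <tr><td>数据分析实战</td><td>¥79.00</td><td>850+</td></tr>
    </tbody>
</table>
"""

# read_html returns a list of DataFrames, one per <table> found in the document
df = pd.read_html(StringIO(sample_html))[0]

# Clean the scraped strings into numeric columns:
# strip the currency symbol from the price and the trailing "+" from the sales figure
df["价格"] = df["价格"].str.replace("¥", "", regex=False).astype(float)
df["销量"] = df["销量"].str.replace("+", "", regex=False).astype(int)
print(df)
```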