14.2 Web Scraping and Parsing Basics

14.2.1 The Requests HTTP Library

Installation and Basic Usage

# Install the requests library (if not already installed)
# pip install requests

import requests
import time  # Used by the retry example further down

# Example: sending a GET request
def basic_get_request():
    url = "https://httpbin.org/get"
    try:
        response = requests.get(url)
        # Check the response status code (200 means success)
        if response.status_code == 200:
            # Parse the response body as JSON
            json_data = response.json()
            print("Request succeeded!")
            print("Response data:", json_data)
            return json_data
        else:
            print(f"Request failed, status code: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request raised an exception: {e}")
        return None
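
# A minimal supplementary sketch: besides response.json(), the body of a response can be read
# as decoded text via response.text or as raw bytes via response.content; response.encoding
# controls how .text decodes the bytes and can be overridden for mis-detected pages.
def inspect_response_body():
    response = requests.get("https://httpbin.org/get")
    print("Detected encoding:", response.encoding)
    print("First 80 characters of text:", response.text[:80])
    print("First 20 raw bytes:", response.content[:20])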

# Example: GET request with query parameters
def get_with_params():
    url = "https://httpbin.org/get"
    params = {
        "page": 1,
        "limit": 10,
        "category": "python"
    }
    try:
        response = requests.get(url, params=params)
        print(f"Request URL: {response.url}")  # Inspect the full URL (including the encoded parameters)
        print("Response data:", response.json())
        return response.json()
    except Exception as e:
        print(f"Error: {e}")

# Example: sending a POST request
def basic_post_request():
    url = "https://httpbin.org/post"
    data = {
        "name": "John Doe",
        "age": 30,
        "hobbies": ["reading", "coding"]
    }
    try:
        response = requests.post(url, json=data)  # The json= parameter automatically sets Content-Type to application/json
        print("POST response:", response.json())
        return response.json()
    except Exception as e:
        print(f"Error: {e}")

# Working with request headers and cookies
def request_with_headers_and_cookies():
    url = "https://httpbin.org/headers"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"
    }
    cookies = {
        "session_id": "abc123",
        "theme": "dark"
    }
    try:
        response = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            timeout=5  # Timeout in seconds
        )
        # The /headers endpoint echoes only the request headers; cookies show up inside
        # the echoed "Cookie" header rather than under a separate "cookies" key.
        print("Request headers echoed back:", response.json()["headers"])
        print("Cookie header echoed back:", response.json()["headers"].get("Cookie"))
        return response.json()
    except Exception as e:
        print(f"Error: {e}")

# Save the response body to a file
def save_response_to_file():
    url = "https://httpbin.org/image/png"
    try:
        response = requests.get(url, stream=True)  # stream=True downloads large files in chunks
        if response.status_code == 200:
            with open("image.png", "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print("Image saved as image.png")
            return True
        else:
            print(f"Request failed, status code: {response.status_code}")
            return False
    except Exception as e:
        print(f"Error: {e}")
        return False
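
# A minimal supplementary sketch: response.headers is a case-insensitive mapping of the
# response headers, so the Content-Type can be checked before the body is written to disk.
def check_content_type(url="https://httpbin.org/image/png"):
    response = requests.get(url, stream=True)
    content_type = response.headers.get("Content-Type", "")
    print("Content-Type:", content_type)
    return content_type.startswith("image/")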

# Timeouts and retries
# Note: requests.get() does not accept retries or backoff_factor arguments; retries have to be
# implemented manually (as below) or delegated to urllib3 via a Session (see the sketch after this listing).
def request_with_retries(max_retries=3, backoff_factor=1):
    url = "https://httpbin.org/delay/2"  # Endpoint that responds after a 2-second delay
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=3)  # 3-second timeout per attempt
            print(f"Request succeeded, status code: {response.status_code}")
            return response.json()
        except requests.exceptions.Timeout:
            wait = backoff_factor * (2 ** attempt)  # Back off 1, 2, 4... seconds
            print(f"Request timed out, retrying in {wait} seconds (attempt {attempt + 1}/{max_retries})")
            time.sleep(wait)
        except requests.exceptions.RequestException as e:
            print(f"Request raised an exception: {e}")
            return None
    print("All retries failed")
    return None

# Run the example functions
if __name__ == "__main__":
    basic_get_request()
    get_with_params()
    basic_post_request()
    request_with_headers_and_cookies()
    save_response_to_file()
    request_with_retries()
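
As an alternative to the manual retry loop above, retries can be delegated to urllib3 by mounting an HTTPAdapter configured with a Retry policy onto a requests.Session. The following is a minimal sketch; the retry counts and status codes are illustrative values, and make_retrying_session is a helper name introduced here rather than part of requests itself.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(total=3, backoff_factor=1):
    # Build a Session whose HTTP and HTTPS adapters retry failed requests automatically
    retry_policy = Retry(
        total=total,                            # Maximum number of retries
        backoff_factor=backoff_factor,          # Sleep an exponentially growing interval between attempts
        status_forcelist=[500, 502, 503, 504],  # Also retry on these server error status codes
    )
    adapter = HTTPAdapter(max_retries=retry_policy)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# Usage: a Session is called just like the requests module itself
# session = make_retrying_session()
# response = session.get("https://httpbin.org/status/503", timeout=3)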

14.2.2 The BeautifulSoup Parsing Library

Installation and Basic Usage

# Install BeautifulSoup and a parser (choose one appropriate for your system)
# pip install beautifulsoup4 lxml

from bs4 import BeautifulSoup
import requests

def basic_bs4_demo():
    # Sample HTML content (replace with the response body of a real page)
    html_content = """
    <html>
    <head>
        <title>BeautifulSoup示例页面</title>
    </head>
    <body>
        <div class="container">
            <h1>Python网页解析</h1>
            <p class="intro">这是一个使用BeautifulSoup解析的示例页面</p>
            <ul class="menu">
                <li><a href="/home">首页</a></li>
                <li><a href="/about">关于</a></li>
                <li><a href="/contact">联系我们</a></li>
            </ul>
            <div class="article">
                <h2>BeautifulSoup教程</h2>
                <p>BeautifulSoup是一个HTML/XML解析器...</p>
                <p>它能从网页中提取数据并处理复杂的HTML结构。</p>
            </div>
            <div class="article">
                <h2>Requests库使用</h2>
                <p>Requests库用于发送HTTP请求...</p>
                <p>它提供了简洁的API来处理各种HTTP方法。</p>
            </div>
        </div>
    </body>
    </html>
    """

    # Create a BeautifulSoup object and specify a parser (lxml is a fast parser)
    soup = BeautifulSoup(html_content, 'lxml')

    # Basic operations
    print("1. Page title tag:", soup.title)
    print("   Title text:", soup.title.string)
    print("   Tag name:", soup.title.name)

    print("\n2. All links:")
    all_links = soup.find_all('a')  # Find every <a> tag
    for link in all_links:
        print(f"   Link text: {link.string}, URL: {link.get('href')}")

    print("\n3. Articles with a specific class:")
    articles = soup.find_all('div', class_='article')  # Find div tags whose class is "article"
    for article in articles:
        title = article.find('h2').string
        content = article.find('p').string
        print(f"   Article title: {title}")
        print(f"   Article content: {content}")

    print("\n4. A specific link:")
    home_link = soup.find('a', string='首页')  # Find a link by its text content ("首页" is the home link in the sample HTML)
    if home_link:
        print(f"   Home page link: {home_link.get('href')}")

# Fetch a real page with requests and parse it
def parse_real_website():
    url = "https://example.com"  # Replace with the target site's URL
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for 4xx/5xx status codes

        soup = BeautifulSoup(response.text, 'lxml')

        # Extract the site title and description (adjust the selectors to the actual page structure)
        title = soup.title.string if soup.title else "No title"
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = description_tag['content'] if description_tag else "No description"

        print(f"Site title: {title}")
        print(f"Site description: {description}")

        # Extract all image links (adjust the selectors to the actual page structure)
        images = soup.find_all('img')
        print(f"Number of images: {len(images)}")
        for img in images:
            img_url = img.get('src')
            if img_url and img_url.startswith('/'):  # Handle root-relative paths (see the urljoin sketch below for a more general approach)
                img_url = f"{url}{img_url}"
            print(f"   Image URL: {img_url}")

        return soup
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None
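
# A minimal supplementary sketch: the startswith('/') check above only covers root-relative
# paths. urllib.parse.urljoin from the standard library resolves relative, root-relative and
# absolute URLs uniformly against the page URL.
from urllib.parse import urljoin

def resolve_image_url(page_url, src):
    # urljoin("https://example.com/blog/post", "img/a.png")  -> "https://example.com/blog/img/a.png"
    # urljoin("https://example.com/blog/post", "/img/a.png") -> "https://example.com/img/a.png"
    # urljoin("https://example.com/blog/post", "https://cdn.example.com/a.png") -> unchanged
    return urljoin(page_url, src)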

# Ways of locating elements
def locate_elements():
    html = """
    <html>
    <body>
        <div id="main-content">
            <p class="para">段落1</p>
            <p class="para">段落2</p>
            <div class="section">
                <p class="para">段落3</p>
                <p class="para">段落4</p>
            </div>
        </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html, 'lxml')

    # 1. find() returns the first matching element
    first_para = soup.find('p')
    print(f"First p tag: {first_para.string}")

    # 2. find_all() returns every matching element
    all_paras = soup.find_all('p', class_='para')  # Find p tags whose class is "para"
    print(f"Number of p tags with class 'para': {len(all_paras)}")
    for para in all_paras:
        print(f"   Content: {para.string}")

    # 3. CSS selectors (more flexible)
    section_paras = soup.select('div.section p.para')  # p.para elements nested inside div.section
    print(f"Number of paragraphs found via the CSS selector: {len(section_paras)}")
    for para in section_paras:
        print(f"   CSS-selector content: {para.string}")

    # 4. Locating elements by attribute
    main_content = soup.find('div', id='main-content')  # Find an element by its id
    print(f"Tag name of the element with id 'main-content': {main_content.name}")

# Working with nested elements and hierarchy
def handle_nested_elements():
    html = """
    <html>
    <body>
        <ul class="products">
            <li class="product">
                <h3>Python编程入门</h3>
                <p>作者:张三</p>
                <span class="price">¥59.00</span>
            </li>
            <li class="product">
                <h3>数据分析实战</h3>
                <p>作者:李四</p>
                <span class="price">¥79.00</span>
            </li>
        </ul>
    </body>
    </html>
    """
    soup = BeautifulSoup(html, 'lxml')

    # Extract every product (nested structure)
    products = soup.find_all('li', class_='product')
    for product in products:
        title = product.find('h3').string
        author = product.find('p').string
        price = product.find('span', class_='price').string
        print(f"Product: {title}")
        print(f"  Author: {author}")
        print(f"  Price: {price}")

# Run the example functions
if __name__ == "__main__":
    basic_bs4_demo()
    locate_elements()
    handle_nested_elements()
    parse_real_website()
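
Besides find/find_all and CSS selectors, BeautifulSoup also lets you walk the parse tree relative to an element you have already located. A minimal sketch follows; the tag names and classes in its sample HTML are illustrative only.

from bs4 import BeautifulSoup

def navigate_tree_demo():
    html = """
    <div class="article">
        <h2>Sample title</h2>
        <p>First paragraph</p>
        <p>Second paragraph</p>
    </div>
    """
    soup = BeautifulSoup(html, 'lxml')
    heading = soup.find('h2')

    # Move up to the enclosing element
    print("Parent tag:", heading.parent.name)
    # Move sideways to the next element at the same level
    print("Next sibling:", heading.find_next_sibling('p').get_text())
    # Iterate over the direct children of the parent element
    for child in heading.parent.find_all(recursive=False):
        print("Child tag:", child.name)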

14.2.3 Data Extraction and Cleaning

Data Extraction and Processing

```python
from bs4 import BeautifulSoup
import requests
import re

def extract_and_clean_data():
    # 1. Basic extraction and cleaning (sample product HTML)
    html = """
    <div class="product">
        <h2>Python网络爬虫实战</h2>
        <p>作者:王小明 <span class="price">¥69.00</span></p>
        <div class="rating">★★★★☆ <span class="review-count">(128条评价)</span></div>
        <p class="desc">本书详细介绍了网络爬虫的开发技术,包含多个实战案例。</p>
    </div>
    """
    soup = BeautifulSoup(html, 'lxml')

    # Extract the title (plain text)
    title = soup.find('h2').get_text(strip=True)  # strip=True removes leading/trailing whitespace
    print(f"Title: {title}")

    # Extract the price (handle the currency format)
    price_tag = soup.find('span', class_='price')
    if price_tag:
        price_str = price_tag.get_text(strip=True)
        # Use a regular expression to pull out the numeric value (drop the currency symbol and spaces)
        price = re.search(r'¥(\d+\.?\d*)', price_str)
        if price:
            price_value = float(price.group(1))
            print(f"Price: {price_value} yuan")
        else:
            print(f"Unrecognized price format: {price_str}")

    # Extract the review count (the number inside the parentheses)
    rating_div = soup.find('div', class_='rating')
    if rating_div:
        review_count_str = rating_div.find('span', class_='review-count').get_text()
        review_count = re.search(r'\((\d+)条评价\)', review_count_str)
        if review_count:
            review_num = int(review_count.group(1))
            print(f"Review count: {review_num}")
        else:
            print(f"Unrecognized review-count format: {review_count_str}")

# 2. More complex extraction (multiple nested levels)
def extract_product_details(html):
    soup = BeautifulSoup(html, 'lxml')
    product = {}

    # Basic fields
    product['title'] = soup.find('h2').get_text(strip=True)
    # The sample <p> has the form "作者:<name> <price>"; take the first token and drop the "作者:" label
    author_text = soup.find('p').get_text(strip=True)
    product['author'] = author_text.split()[0].replace('作者:', '').replace('作者:', '')
    # Numeric price extracted from a string such as "¥69.00"
    product['price'] = float(re.search(r'¥(\d+\.?\d*)', soup.find('span', class_='price').get_text()).group(1))
    # Rating: count the filled stars in a string such as "★★★★☆" (calling float() on the stars would fail)
    product['rating'] = float(soup.find('div', class_='rating').get_text(strip=True).count('★'))
    product['review_count'] = int(re.search(r'\((\d+)条评价\)', soup.find('span', class_='review-count').get_text()).group(1))
    product['description'] = soup.find('p', class_='desc').get_text(strip=True)

    return product

# Test the product-extraction function
product_html = """
<div class="product">
    <h2>Python网络爬虫实战</h2>
    <p>作者:王小明 <span class="price">¥69.00</span></p>
    <div class="rating">★★★★☆ <span class="review-count">(128条评价)</span></div>
    <p class="desc">本书详细介绍了网络爬虫的开发技术,包含多个实战案例。</p>
</div>
"""
product = extract_product_details(product_html)
print("\n产品详细信息:")
for key, value in product.items():
    print(f"  {key}: {value}")

# 3. Working with table data (converting it into structured records)
def extract_table_data():
    html = """
    <table class="data-table">
        <thead>
            <tr>
                <th>产品名称</th>
                <th>价格</th>
                <th>销量</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Python编程入门</td>
                <td>¥59.00</td>
                <td>1000+</td>
            </tr>
            <tr>
                <td>数据分析实战</td>
                <td>¥79.00</td>
                <td>850+</td>
            </tr>
        </tbody>
    </table>
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='data-table')
    headers = [th.get_text(strip=True) for th in table.find_all('th')]

    # Walk the body rows and pair each cell with its column header
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        rows.append(dict(zip(headers, cells)))

    print("Table data:")
    for row in rows:
        print(f"  {row}")
    return rows

# Run the remaining extraction examples
extract_and_clean_data()
extract_table_data()
```
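
When a table is well-formed, pandas can also parse it directly into a DataFrame and take over much of the cleaning step. The following is a minimal sketch, assuming pandas and lxml are installed; it reuses the sample table above, and the string-cleaning steps (stripping the currency symbol and the trailing "+") are illustrative.

```python
from io import StringIO

import pandas as pd

sample_html = """
<table class="data-table">
    <thead>
        <tr><th>产品名称</th><th>价格</th><th>销量</th></tr>
    </thead>
    <tbody>
        <tr><td>Python编程入门</td><td>¥59.00</td><td>1000+</td></tr>
        <tr><td>数据分析实战</td><td>¥79.00</td><td>850+</td></tr>
    </tbody>
</table>
"""

# read_html returns a list of DataFrames, one per <table> found in the document
df = pd.read_html(StringIO(sample_html))[0]

# Clean the scraped strings into numeric columns:
# strip the currency symbol from the price and the trailing "+" from the sales figure
df["价格"] = df["价格"].str.replace("¥", "", regex=False).astype(float)
df["销量"] = df["销量"].str.replace("+", "", regex=False).astype(int)
print(df)
```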