14.2 Web Scraping and Parsing Basics
14.2.1 The Requests HTTP Library
Installation and Basic Usage
# Install the requests library (if it is not installed yet)
# pip install requests

import requests

# Example: sending a GET request
def basic_get_request():
    url = "https://httpbin.org/get"
    try:
        response = requests.get(url)
        # Check the response status code (200 means success)
        if response.status_code == 200:
            # Parse the JSON response body
            json_data = response.json()
            print("Request succeeded!")
            print("Response data:", json_data)
            return json_data
        else:
            print(f"Request failed with status code: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request raised an exception: {e}")
        return None
# Example: GET request with query parameters
def get_with_params():
    url = "https://httpbin.org/get"
    params = {
        "page": 1,
        "limit": 10,
        "category": "python"
    }
    try:
        response = requests.get(url, params=params)
        print(f"Request URL: {response.url}")  # Inspect the full URL (with encoded parameters)
        print("Response data:", response.json())
        return response.json()
    except Exception as e:
        print(f"Error: {e}")
# Example: sending a POST request
def basic_post_request():
    url = "https://httpbin.org/post"
    data = {
        "name": "John Doe",
        "age": 30,
        "hobbies": ["reading", "coding"]
    }
    try:
        # The json parameter serializes the body and sets Content-Type to application/json automatically
        response = requests.post(url, json=data)
        print("POST response:", response.json())
        return response.json()
    except Exception as e:
        print(f"Error: {e}")
# Handling request headers and cookies
def request_with_headers_and_cookies():
    url = "https://httpbin.org/headers"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"
    }
    cookies = {
        "session_id": "abc123",
        "theme": "dark"
    }
    try:
        response = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            timeout=5  # Timeout in seconds
        )
        print("Request headers:", response.json()["headers"])
        # The /headers endpoint only echoes headers; the cookies appear in the Cookie header
        print("Cookie header:", response.json()["headers"].get("Cookie"))
        return response.json()
    except Exception as e:
        print(f"Error: {e}")
# Saving response content to a file
def save_response_to_file():
    url = "https://httpbin.org/image/png"
    try:
        response = requests.get(url, stream=True)  # stream=True downloads large files in chunks
        if response.status_code == 200:
            with open("image.png", "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print("Image saved as image.png")
            return True
        else:
            print(f"Request failed with status code: {response.status_code}")
            return False
    except Exception as e:
        print(f"Error: {e}")
        return False
# Timeout and retry mechanism
def request_with_retries():
    # requests.get() itself has no retry parameters; mount an HTTPAdapter configured
    # with urllib3's Retry on a Session instead
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    url = "https://httpbin.org/delay/2"  # Endpoint that delays its response by 2 seconds
    session = requests.Session()
    retry_strategy = Retry(
        total=3,            # Retry up to 3 times
        backoff_factor=1,   # Wait roughly 1, 2, 4... seconds between attempts
        status_forcelist=[500, 502, 503, 504]  # Also retry on these status codes
    )
    session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
    try:
        response = session.get(url, timeout=3)  # 3-second timeout per attempt
        print(f"Request succeeded with status code: {response.status_code}")
        return response.json()
    except requests.exceptions.Timeout:
        print("Request timed out")
    except Exception as e:
        print(f"Request raised an exception: {e}")
# Run the example functions
if __name__ == "__main__":
    basic_get_request()
    get_with_params()
    basic_post_request()
    request_with_headers_and_cookies()
    save_response_to_file()
    request_with_retries()
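
When every request in a crawl should carry the same headers and cookies, a requests.Session stores them once and also reuses the underlying connection. The sketch below is a minimal illustration against the same httpbin.org test endpoints used above; the session_demo name is only an example.

def session_demo():
    # A Session keeps headers, cookies, and the TCP connection across requests
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; demo-crawler)"})
    session.cookies.set("session_id", "abc123")
    try:
        first = session.get("https://httpbin.org/cookies", timeout=5)
        second = session.get("https://httpbin.org/headers", timeout=5)
        print("Cookies seen by the server:", first.json()["cookies"])
        print("Headers seen by the server:", second.json()["headers"])
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")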
14.2.2 The BeautifulSoup HTML Parsing Library
Installation and Basic Usage
# Install BeautifulSoup and a parser (pick one suited to your system)
# pip install beautifulsoup4 lxml

from bs4 import BeautifulSoup
import requests

def basic_bs4_demo():
    # Sample HTML content (replace with the body of a real response)
    html_content = """
    <html>
    <head>
        <title>BeautifulSoup Example Page</title>
    </head>
    <body>
        <div class="container">
            <h1>Parsing Web Pages with Python</h1>
            <p class="intro">This is a sample page parsed with BeautifulSoup</p>
            <ul class="menu">
                <li><a href="/home">Home</a></li>
                <li><a href="/about">About</a></li>
                <li><a href="/contact">Contact</a></li>
            </ul>
            <div class="article">
                <h2>BeautifulSoup Tutorial</h2>
                <p>BeautifulSoup is an HTML/XML parser...</p>
                <p>It extracts data from web pages and copes with messy HTML structures.</p>
            </div>
            <div class="article">
                <h2>Using the Requests Library</h2>
                <p>Requests is used to send HTTP requests...</p>
                <p>It offers a clean API for all the common HTTP methods.</p>
            </div>
        </div>
    </body>
    </html>
    """
    # Create a BeautifulSoup object and specify the parser (lxml is fast and lenient)
    soup = BeautifulSoup(html_content, 'lxml')

    # Basic operations
    print("1. Page title tag:", soup.title)
    print("   Title text:", soup.title.string)
    print("   Title tag name:", soup.title.name)

    print("\n2. All links:")
    all_links = soup.find_all('a')  # Find every <a> tag
    for link in all_links:
        print(f"   Link text: {link.string}, URL: {link.get('href')}")

    print("\n3. Articles of a specific class:")
    articles = soup.find_all('div', class_='article')  # Find <div> tags with class "article"
    for article in articles:
        title = article.find('h2').string
        content = article.find('p').string
        print(f"   Article title: {title}")
        print(f"   Article content: {content}")

    print("\n4. A specific link:")
    home_link = soup.find('a', string='Home')  # Find a link by its text content
    if home_link:
        print(f"   Home page link: {home_link.get('href')}")
# Fetch a real page with requests and parse it
def parse_real_website():
    url = "https://example.com"  # Replace with the target site's URL
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for non-2xx status codes
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract the page title and description (adjust the selectors to the actual page structure)
        title = soup.title.string if soup.title else "No title"
        desc_tag = soup.find('meta', attrs={'name': 'description'})
        description = desc_tag['content'] if desc_tag else "No description"
        print(f"Site title: {title}")
        print(f"Site description: {description}")

        # Extract all image URLs (adjust the selectors to the actual page structure)
        images = soup.find_all('img')
        print(f"Number of images: {len(images)}")
        for img in images:
            img_url = img.get('src')
            if img_url and img_url.startswith('/'):  # Handle root-relative paths
                img_url = f"{url}{img_url}"
            print(f"   Image URL: {img_url}")
        return soup
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None
# Ways to locate elements
def locate_elements():
    html = """
    <html>
    <body>
        <div id="main-content">
            <p class="para">Paragraph 1</p>
            <p class="para">Paragraph 2</p>
            <div class="section">
                <p class="para">Paragraph 3</p>
                <p class="para">Paragraph 4</p>
            </div>
        </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html, 'lxml')

    # 1. find() returns the first matching element
    first_para = soup.find('p')
    print(f"First <p> tag: {first_para.string}")

    # 2. find_all() returns every matching element
    all_paras = soup.find_all('p', class_='para')  # <p> tags with class "para"
    print(f"Number of <p> tags with class 'para': {len(all_paras)}")
    for para in all_paras:
        print(f"   Content: {para.string}")

    # 3. CSS selectors are more flexible
    section_paras = soup.select('div.section p.para')  # <p class="para"> inside <div class="section">
    print(f"Paragraphs found via CSS selector: {len(section_paras)}")
    for para in section_paras:
        print(f"   CSS selector content: {para.string}")

    # 4. Locating elements by attribute
    main_content = soup.find('div', id='main-content')  # Find an element by its id
    print(f"Element with id 'main-content': {main_content.name}")
# Handling nested elements and hierarchy
def handle_nested_elements():
    html = """
    <html>
    <body>
        <ul class="products">
            <li class="product">
                <h3>Introduction to Python Programming</h3>
                <p>Author: Zhang San</p>
                <span class="price">¥59.00</span>
            </li>
            <li class="product">
                <h3>Data Analysis in Action</h3>
                <p>Author: Li Si</p>
                <span class="price">¥79.00</span>
            </li>
        </ul>
    </body>
    </html>
    """
    soup = BeautifulSoup(html, 'lxml')

    # Walk the nested structure to collect every product
    products = soup.find_all('li', class_='product')
    for product in products:
        title = product.find('h3').string
        author = product.find('p').string
        price = product.find('span', class_='price').string
        print(f"Product: {title}")
        print(f"   Author: {author}")
        print(f"   Price: {price}")
# Run the example functions
if __name__ == "__main__":
    basic_bs4_demo()
    locate_elements()
    handle_nested_elements()
    parse_real_website()
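
parse_real_website() above only patches up root-relative src values by hand. The standard library's urllib.parse.urljoin resolves relative, root-relative, and absolute URLs uniformly against the page URL; the resolve_image_urls helper below is a minimal illustrative sketch of that approach.

from urllib.parse import urljoin

def resolve_image_urls(page_url, soup):
    # Resolve every img src against the page URL, whatever form it takes
    urls = []
    for img in soup.find_all('img'):
        src = img.get('src')
        if src:
            urls.append(urljoin(page_url, src))
    return urls

For example, urljoin("https://example.com/docs/", "../img/logo.png") yields "https://example.com/img/logo.png".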
14.2.3 Data Extraction and Cleaning
Data Extraction and Processing
from bs4 import BeautifulSoup
import requests
import re

def extract_and_clean_data():
    # 1. Basic extraction and cleaning (modelled on a typical product page)
    html = """
    <div class="product">
        <h2>Python Web Scraping in Action</h2>
        <p>Author: Wang Xiaoming <span class="price">¥69.00</span></p>
        <div class="rating">★★★★☆ <span class="review-count">(128 reviews)</span></div>
        <p class="desc">This book covers web crawler development in detail, with several hands-on projects.</p>
    </div>
    """
    soup = BeautifulSoup(html, 'lxml')

    # Extract the title (plain text extraction)
    title = soup.find('h2').get_text(strip=True)  # strip=True trims leading/trailing whitespace
    print(f"Title: {title}")

    # Extract the price (handle the currency formatting)
    price_tag = soup.find('span', class_='price')
    if price_tag:
        price_str = price_tag.get_text(strip=True)
        # Use a regular expression to pull out the numeric value (dropping the currency symbol)
        price = re.search(r'¥(\d+\.?\d*)', price_str)
        if price:
            price_value = float(price.group(1))
            print(f"Price: {price_value}")
        else:
            print(f"Unrecognized price format: {price_str}")

    # Extract the review count (a number inside parentheses)
    rating_div = soup.find('div', class_='rating')
    if rating_div:
        review_count_str = rating_div.find('span', class_='review-count').get_text()
        review_count = re.search(r'\((\d+) reviews\)', review_count_str)
        if review_count:
            review_num = int(review_count.group(1))
            print(f"Review count: {review_num}")
        else:
            print(f"Unrecognized review-count format: {review_count_str}")
# 2. Handling more complex, nested data
def extract_product_details(html):
    soup = BeautifulSoup(html, 'lxml')
    product = {}
    # Basic fields
    product['title'] = soup.find('h2').get_text(strip=True)
    # Strip the "Author:" label and the trailing price text from the first <p>
    author_text = soup.find('p').get_text(strip=True)
    product['author'] = author_text.replace('Author:', '').split('¥')[0].strip()
    product['price'] = re.search(r'¥(\d+\.?\d*)', soup.find('span', class_='price').get_text()).group(1)
    # Count the filled stars (assuming a ★★★★☆-style rating string)
    product['rating'] = soup.find('div', class_='rating').get_text().count('★')
    product['review_count'] = int(re.search(r'\((\d+) reviews\)', soup.find('span', class_='review-count').get_text()).group(1))
    product['description'] = soup.find('p', class_='desc').get_text(strip=True)
    return product

# Test the extraction function
product_html = """
<div class="product">
    <h2>Python Web Scraping in Action</h2>
    <p>Author: Wang Xiaoming <span class="price">¥69.00</span></p>
    <div class="rating">★★★★☆ <span class="review-count">(128 reviews)</span></div>
    <p class="desc">This book covers web crawler development in detail, with several hands-on projects.</p>
</div>
"""
product = extract_product_details(product_html)
print("\nProduct details:")
for key, value in product.items():
    print(f"  {key}: {value}")
# 3. Handling table data (turning it into structured records)
def extract_table_data():
    html = """
    <table class="data-table">
        <thead>
            <tr>
                <th>Product Name</th>
                <th>Price</th>
                <th>Sales</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Introduction to Python Programming</td>
                <td>¥59.00</td>
                <td>1000+</td>
            </tr>
            <tr>
                <td>Data Analysis in Action</td>
                <td>¥79.00</td>
                <td>850+</td>
            </tr>
        </tbody>
    </table>
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='data-table')
    headers = [th.get_text(strip=True) for th in table.find_all('th')]