Python 爬虫实战完全指南:从入门到精通
Python 爬虫是数据采集的核心技能,掌握 Requests、BeautifulSoup、Selenium、Scrapy 等工具,能让你从海量网页中高效提取结构化数据。本文从零开始,带你系统学习 Python 爬虫开发。
一、爬虫核心概念
网络爬虫(Web Crawler),又称网络蜘蛛、网络机器人,是一种按照一定规则自动抓取万维网信息的程序。Python 凭借其丰富的库生态和简洁的语法,成为爬虫开发的首选语言。
爬虫的核心价值:
- 数据采集:从公开网站获取结构化数据,支撑数据分析、机器学习等场景
- 价格监控:实时追踪商品价格变动,辅助商业决策
- 内容聚合:自动收集新闻、文章等内容,构建信息平台
- 竞品分析:抓取公开数据,分析市场趋势和竞争格局
二、核心库详解
1. Requests - HTTP 请求神器
Requests 是 Python 最流行的 HTTP 库,简洁优雅的 API 让网络请求变得极其简单。
import requests
from fake_useragent import UserAgent

# Random User-Agent generator (helps avoid naive UA-based blocking).
ua = UserAgent()

# Basic GET request.  Always set a timeout: requests has no default, and a
# hung connection would otherwise block forever.
response = requests.get(
    "https://example.com/api/data",
    headers={"User-Agent": ua.random},
    timeout=10,
)

# Check the response status before using the body.
if response.status_code == 200:
    html = response.text    # raw text body
    data = response.json()  # parsed JSON (raises ValueError if body is not JSON)

# POST request with form data.
form_data = {"username": "test", "password": "123456"}
response = requests.post(
    "https://example.com/login",
    data=form_data,
    headers={"User-Agent": ua.random},
    timeout=10,
)

# Use a Session to persist cookies across requests (e.g. keep a login).
session = requests.Session()
session.post("https://example.com/login", data=form_data, timeout=10)
profile = session.get("https://example.com/profile", timeout=10)
2. BeautifulSoup - HTML 解析利器
BeautifulSoup 将复杂的 HTML 文档转换为易于操作的解析树,支持多种解析器。
from bs4 import BeautifulSoup
import requests

# Fetch the listing page (timeout prevents hanging on a dead server).
response = requests.get("https://example.com/articles", timeout=10)
soup = BeautifulSoup(response.text, "html.parser")

# Basic selector: first <h1 class="article-title"> in the document.
title = soup.find("h1", class_="article-title")
print(title.text.strip())

# find_all() returns every matching element.
articles = soup.find_all("div", class_="article-item")
for article in articles:
    title = article.find("h2").text
    link = article.find("a")["href"]
    print(f"标题: {title}, 链接: {link}")

# CSS selectors (recommended: more concise than chained find() calls).
items = soup.select(".article-list article")
for item in items:
    title = item.select_one("h2.title").text
    author = item.select_one(".author").text
    print(f"{title} - {author}")
3. Selenium - 动态内容抓取
当网页内容由 JavaScript 动态生成时,Selenium 模拟浏览器行为抓取动态内容。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

# Configure Chrome for headless scraping.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("https://example.com/dynamic-page")

    # Explicit wait: block (up to 10s) until the JS-rendered element exists.
    wait = WebDriverWait(driver, 10)
    content = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "article-content"))
    )

    title = driver.find_element(By.TAG_NAME, "h1").text
    print(f"标题: {title}")

    # Scroll to the bottom (can trigger lazy-loaded / infinite-scroll content).
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Full rendered HTML after JavaScript execution.
    page_source = driver.page_source
finally:
    # Always quit, even on error, so no orphaned browser process is left behind.
    driver.quit()
4. Scrapy - 专业爬虫框架
Scrapy 是功能强大的爬虫框架,适用于大规模爬取项目。
import scrapy


class ArticleSpider(scrapy.Spider):
    """Crawl example.com article listings, following pagination links."""

    name = "article_spider"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com/articles"]
    # Per-spider overrides: up to 16 concurrent requests, 0.5s download delay.
    custom_settings = {
        "CONCURRENT_REQUESTS": 16,
        "DOWNLOAD_DELAY": 0.5,
    }

    def parse(self, response):
        """Yield one dict per article on the page, then follow the next page."""
        for article in response.css("div.article-item"):
            yield {
                "title": article.css("h2.title::text").get(),
                "author": article.css(".author::text").get(),
                "link": article.css("a::attr(href)").get(),
            }
        # Pagination: recurse into the next page if a "next" link exists.
        next_page = response.css("a.next::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
三、爬虫基本流程
一个完整的爬虫项目包含以下步骤:
1. 分析网站结构
使用浏览器开发者工具(F12)分析目标网站:数据位置、请求方式、反爬机制等。
2. 发送请求获取响应
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Session with automatic retries: up to 3 attempts with exponential backoff
# (0.5s, 1s, 2s) on transient 5xx server errors.
session = requests.Session()
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)

# (connect timeout, read timeout): fail fast on connect, allow a slow read.
response = session.get("https://example.com/data", timeout=(5, 30))
response.raise_for_status()  # raise HTTPError on 4xx/5xx responses
3. 解析提取数据
from bs4 import BeautifulSoup
# Parse the previously fetched response; "lxml" is a C-backed parser
# (faster than the pure-Python "html.parser", but requires lxml installed).
soup = BeautifulSoup(response.text, "lxml")
def clean_text(text):
    """Collapse all runs of whitespace in *text* into single spaces.

    Returns "" for None or empty input, so callers never get None back.
    """
    if not text:
        return ""
    return " ".join(text.split())
# Walk every product card and pull out the fields we need.
data = []
items = soup.select("div.product-item")
for item in items:
    product = {
        # clean_text collapses stray whitespace inside the product name.
        "name": clean_text(item.select_one(".name").text),
        # Strip the currency symbol so the price can later be parsed as a number.
        "price": item.select_one(".price").text.replace("¥", "").strip(),
    }
    data.append(product)
4. 数据存储
import json
import csv

# JSON: ensure_ascii=False keeps Chinese text human-readable in the file.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# CSV: newline="" prevents blank lines between rows on Windows.
# NOTE: assumes `data` is non-empty; data[0].keys() supplies the header.
with open("data.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=data[0].keys())
    writer.writeheader()
    writer.writerows(data)
四、实战案例:价格监控爬虫
import sqlite3
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class PriceMonitor:
    """Track product prices over time in a local SQLite database.

    Each monitored product's page is scraped for its ``.price`` element;
    every observed price is appended to a history table, and an alert is
    printed when the price drops to or below the product's target price.
    """

    def __init__(self, db_path="prices.db"):
        self.ua = UserAgent()
        self.session = requests.Session()
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        """Create the products / price_history tables if they don't exist."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS products (
                id INTEGER PRIMARY KEY,
                name TEXT,
                url TEXT UNIQUE,
                target_price REAL
            )
        """)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS price_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                product_id INTEGER,
                price REAL,
                recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.commit()
        conn.close()

    def add_product(self, name, url, target_price=None):
        """Register a product URL to monitor (silently ignored if the URL exists)."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute(
            "INSERT OR IGNORE INTO products (name, url, target_price) VALUES (?, ?, ?)",
            (name, url, target_price)
        )
        conn.commit()
        conn.close()

    def fetch_price(self, url):
        """Scrape the current price from *url*; return None on any failure."""
        headers = {"User-Agent": self.ua.random}
        try:
            response = self.session.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")
            price_element = soup.select_one(".price")
            if price_element:
                return float(price_element.text.replace("¥", "").strip())
        except Exception as e:
            # Best-effort monitoring: network/parse errors just skip this round.
            print(f"获取价格失败: {e}")
        return None

    def check_prices(self):
        """Fetch every monitored product once and record its current price."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT id, name, url, target_price FROM products")
        for product_id, name, url, target_price in cursor.fetchall():
            price = self.fetch_price(url)
            # Compare against None, not truthiness: a legitimate price of
            # 0.0 would otherwise be silently dropped.
            if price is not None:
                cursor.execute(
                    "INSERT INTO price_history (product_id, price) VALUES (?, ?)",
                    (product_id, price)
                )
                print(f"[{datetime.now()}] {name}: ¥{price}")
                if target_price and price <= target_price:
                    print(f"降价提醒!{name} 已降至 ¥{price}")
            time.sleep(2)  # be polite: pause between product requests
        conn.commit()
        conn.close()
# Usage: register one product, then run a single monitoring pass.
monitor = PriceMonitor()
monitor.add_product("示例商品", "https://example.com/product", target_price=99.0)
monitor.check_prices()
五、反爬虫机制应对
1. 请求频率控制
import time
import random
def smart_delay(min_delay=1, max_delay=3):
    """Sleep for a random duration in [min_delay, max_delay] seconds.

    Randomized pauses make request timing look less bot-like than a
    fixed interval between requests.
    """
    delay = random.uniform(min_delay, max_delay)
    time.sleep(delay)
# Throttled crawl: random 2-5s pause between pages (timeout avoids a
# single dead host stalling the whole loop).
for url in urls:
    response = requests.get(url, timeout=10)
    process(response)
    smart_delay(2, 5)
2. User-Agent 池
import random
# A small pool of realistic desktop User-Agent strings; rotating them
# keeps every request from carrying an identical UA.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Chrome/120.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Firefox/121.0",
]

# Draw a fresh UA at random and build the request headers from it.
chosen_ua = random.choice(USER_AGENTS)
headers = {"User-Agent": chosen_ua}
3. IP 代理池
# Rotating proxy pool (replace the placeholders with real proxy endpoints).
PROXY_POOL = ["http://ip1:port", "http://ip2:port"]

def get_proxy():
    """Return a requests-style ``proxies`` dict with randomly chosen entries.

    Note: the http and https slots may pick different proxies from the pool.
    """
    return {"http": random.choice(PROXY_POOL), "https": random.choice(PROXY_POOL)}
# Route the request through a random proxy; timeout keeps a dead proxy
# from hanging the crawl.
response = requests.get(url, proxies=get_proxy(), timeout=10)
六、最佳实践
- 遵守 robots.txt:检查目标网站的爬虫协议
- 控制频率:避免对服务器造成压力
- 尊重版权:抓取数据仅用于学习研究
- 保护隐私:不抓取涉及个人隐私的信息
总结
Python 爬虫是数据采集的利器,掌握 Requests、BeautifulSoup、Selenium、Scrapy 等核心工具,能让你高效地从海量网页中提取结构化数据。记住:合规使用爬虫技术,既是对他人的尊重,也是对自己的保护。
本文链接:https://www.kkkliao.cn/?id=940 转载需授权!
版权声明:本文由廖万里的博客发布,如需转载请注明出处。



手机流量卡
免费领卡
号卡合伙人
产品服务
关于本站
