Scrapy
2
自訂代理中間件

# middlewares.py
from scrapy import signals
class AutoProxyMiddleware:
    """Downloader middleware that applies one proxy (read from the
    ``HTTP_PROXY`` setting) to every outgoing request.

    No ``Proxy-Authorization`` header is added here: Scrapy's built-in
    ``HttpProxyMiddleware`` picks up the credentials embedded in the
    proxy URL and handles authentication automatically.
    """

    def __init__(self, proxy):
        # Proxy URL, e.g. 'http://user:pass@host:port'; may be None/empty.
        self.proxy = proxy

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's settings and hook signals."""
        middleware = cls(crawler.settings.get('HTTP_PROXY'))
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Attach the configured proxy unless the request already has one."""
        if not self.proxy:
            return
        # Leaves any explicitly pre-set request.meta['proxy'] untouched.
        request.meta.setdefault('proxy', self.proxy)

    def spider_opened(self, spider):
        # One-time startup log so the active proxy is visible in spider output.
        spider.logger.info(f'AutoProxyMiddleware enabled, proxy: {self.proxy}')
配置代理

# Proxy configuration — replace the placeholder with your actual credentials.
HTTP_PROXY = 'http://你的使用者名稱:你的密碼@你的代理主機:你的連接埠'

# Downloader middleware chain. The custom middleware runs just before the
# built-in HttpProxyMiddleware (priority 749 < 750), so the proxy it puts in
# request.meta — including the credentials embedded in the URL — is then
# processed by the built-in one. Replace `myproject` with your project name.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.AutoProxyMiddleware': 749,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
}
驗證代理
# test_proxy.py
import scrapy
import json
class TestProxySpider(scrapy.Spider):
    """Fetches https://ipinfo.io/json and prints the reported connection
    details, so you can verify which IP (i.e. which proxy) requests use."""

    name = 'test_proxy'

    def start_requests(self):
        # No per-request proxy meta is needed here — the middleware adds it.
        yield scrapy.Request(
            url='https://ipinfo.io/json',
            callback=self.parse,
        )

    def parse(self, response):
        # Pretty-print the JSON payload returned by ipinfo.io (sample below).
        payload = json.loads(response.text)
        print(json.dumps(payload, indent=2, ensure_ascii=False))
{
"ip": "67.72.110.148",
"city": "Tampa",
"region": "Florida",
"country": "US",
"loc": "27.9475,-82.4584",
"org": "AS23089 Hotwire Communications",
"postal": "33606",
"timezone": "America/New_York",
"readme": "https://ipinfo.io/missingauth"
}
最後更新於