Scrapy
Custom Proxy Middleware

# middlewares.py
from scrapy import signals
class AutoProxyMiddleware:
    """Downloader middleware that assigns a proxy to every outgoing request.

    The proxy URL (credentials included) is read from the ``HTTP_PROXY``
    setting. Scrapy's built-in ``HttpProxyMiddleware`` then adds the
    ``Proxy-Authorization`` header automatically, because the credentials
    are embedded in the proxy URL.
    """

    def __init__(self, proxy):
        # proxy: full proxy URL, e.g. 'http://user:pass@host:port', or None
        # when HTTP_PROXY is not configured (middleware becomes a no-op).
        self.proxy = proxy

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy entry point: build the middleware from crawler settings."""
        # Read proxy configuration from settings.py
        proxy = crawler.settings.get('HTTP_PROXY')
        mw = cls(proxy)
        crawler.signals.connect(mw.spider_opened, signals.spider_opened)
        return mw

    def process_request(self, request, spider):
        # Set the proxy for each request only if it hasn't been set already,
        # so a per-request override via request.meta['proxy'] still wins.
        if self.proxy and 'proxy' not in request.meta:
            request.meta['proxy'] = self.proxy
        # No need to manually add a 'Proxy-Authorization' header.
        # Scrapy's built-in HttpProxyMiddleware will handle authentication
        # automatically because the credentials are in the proxy URL.

    def spider_opened(self, spider):
        # Log the active proxy once the spider starts, for easy debugging.
        spider.logger.info(f'AutoProxyMiddleware enabled, proxy: {self.proxy}')
Configure the Proxy

# Set your proxy (replace the proxy information with your actual credentials)
HTTP_PROXY = 'http://your_username:your_password@your_proxy_host:your_port'

# Configure downloader middlewares. The custom middleware runs before the
# built-in HttpProxyMiddleware (lower number = closer to the engine), so the
# built-in one can pick up request.meta['proxy'] and handle authentication.
DOWNLOADER_MIDDLEWARES = {
    # Custom middleware, priority higher than the built-in HttpProxyMiddleware.
    # Replace 'myproject' with your Scrapy project name.
    'myproject.middlewares.AutoProxyMiddleware': 749,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
}
Verify the Proxy
# test_proxy.py
import scrapy
import json
class TestProxySpider(scrapy.Spider):
    """Spider that fetches https://ipinfo.io/json to verify the proxy exit IP."""
    name = 'test_proxy'

    def start_requests(self):
        # No need to manually add 'meta' for the proxy; the middleware
        # handles it for every request.
        yield scrapy.Request(
            url='https://ipinfo.io/json',
            callback=self.parse
        )

    def parse(self, response):
        # Pretty-print the JSON body so the exit IP/location is easy to read.
        data = json.loads(response.text)
        print(json.dumps(data, indent=2, ensure_ascii=False))

# Example output:
{
"ip": "67.72.110.148",
"city": "Tampa",
"region": "Florida",
"country": "US",
"loc": "27.9475,-82.4584",
"org": "AS23089 Hotwire Communications",
"postal": "33606",
"timezone": "America/New_York",
"readme": "https://ipinfo.io/missingauth"
}
Last updated