Scrapy: rotating IPs and User-Agents
Scrapy middleware setup. Note the settings file: register the custom middleware and disable the built-in one (mapping a middleware to None disables it, so the custom class takes over):
DOWNLOADER_MIDDLEWARES = {
    # '爬虫名字' is a placeholder for your Scrapy project's package name
    '爬虫名字.middlewares.RandomUserAgentMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
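If the rotation should apply to a single spider rather than the whole project, the same mapping can live in that spider's custom_settings instead. A minimal sketch, assuming a hypothetical spider named example (custom_settings is standard Scrapy API that overrides settings.py for that spider only):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"  # hypothetical spider name
    # Per-spider override of the project-wide DOWNLOADER_MIDDLEWARES
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "爬虫名字.middlewares.RandomUserAgentMiddleware": 543,
            "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
        }
    }

    def parse(self, response):
        pass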
In middlewares.py, define the RandomUserAgentMiddleware class:
from scrapy import signals
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    '''
    Randomly rotate the User-Agent.
    Modeled on, and meant to replace, the UserAgentMiddleware class in
    site-packages/scrapy/downloadermiddlewares/useragent.py.
    '''

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        # Read the settings file to decide which fake_useragent attribute
        # to use; defaults to "random", but "ie", "firefox", etc. also work.
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    # The User-Agent is swapped out in this method
    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        ua = get_ua()  # fetch once so the printed value matches what is set
        print(ua)
        request.headers.setdefault('User-Agent', ua)
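The ua_type above is read from the settings file, so the User-Agent family can be changed without touching the middleware. A minimal sketch of the corresponding settings.py entry (the setting name RANDOM_UA_TYPE is exactly what the class reads; the values are attributes of fake_useragent's UserAgent object):

# settings.py
RANDOM_UA_TYPE = "random"  # or "firefox", "chrome", "ie", ...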
The proxy IP works the same way:
import random
import traceback

import requests


class IpProxyDownloadMiddleware(object):
    '''
    Randomly rotate the proxy IP.
    '''

    def __init__(self, crawler):
        super(IpProxyDownloadMiddleware, self).__init__()

    def getIP(self):
        num = 1
        while True:
            # Fetch a candidate proxy from the local proxy-pool service
            proxy = requests.get('http://127.0.0.1:5010/get/?type=http').json()['proxy']
            # Verify that the IP is usable
            url = "http://ip-api.com/json/" + proxy.split(":")[0] + "?lang=zh-CN"
            try:
                data = requests.get(url, proxies={"http": proxy}, timeout=10,
                                    headers={'Connection': 'close'})
                print('scrapy: attempt %s to validate proxy ip %s' % (num, proxy))
                num += 1
                if data.status_code == 200:
                    print('scrapy: initialized proxy ip %s successfully' % proxy)
                    break
            except Exception:
                traceback.print_exc()
                print('retrying')
        return 'http://' + proxy

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    # The proxy is swapped out in this method
    def process_request(self, request, spider):
        current_ip = random.choice(['http://45.144.3.208:59785'])  # this IP will not let Scrapy connect
        request.meta['proxy'] = current_ip  # self.getIP()
        print("scrapy.request:", current_ip)

    def process_response(self, request, response, spider):
        return response
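Like the User-Agent middleware, this one only runs if it is registered in DOWNLOADER_MIDDLEWARES. A minimal sketch, reusing the placeholder project name from above (544 is an arbitrary priority; it just needs to be below 750 so request.meta['proxy'] is set before Scrapy's built-in HttpProxyMiddleware applies it):

DOWNLOADER_MIDDLEWARES = {
    '爬虫名字.middlewares.RandomUserAgentMiddleware': 543,
    '爬虫名字.middlewares.IpProxyDownloadMiddleware': 544,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

Note that getIP assumes a local proxy-pool service listening on 127.0.0.1:5010 (its /get/ endpoint and 'proxy' JSON key match the open-source proxy_pool project); once such a service is running, swap the hard-coded list in process_request for self.getIP().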