Python crawler: hiding your identity with UserAgent / setting a proxy
1. The error module
- urllib.error
- Causes of URLError:
    - no network connection
    - the connection to the server failed
    - the specified server cannot be reached
- URLError is a subclass of OSError
# -*- coding:utf-8 -*-
'''
URLError demo: request a host that does not resolve
'''
from urllib import request, error

if __name__ == '__main__':
    url = "http://www.whjewk.com"
    try:
        req = request.Request(url)
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        print("URLError: {0}".format(e.reason))
        print("URLError: {0}".format(e))
    except Exception as e:
        print(e)

Sample output:

URLError: [Errno -2] Name or service not known
URLError: <urlopen error [Errno -2] Name or service not known>
- HTTPError is a subclass of URLError
- Differences between the two:
    - HTTPError corresponds to an HTTP status-code error: it is raised when the server returns a status code of 400 or above
    - URLError generally corresponds to a network problem, including problems with the URL itself
    - Inheritance chain: OSError -> URLError -> HTTPError
# -*- coding:utf-8 -*-
'''
HTTPError vs URLError demo: request a page that returns a 4xx status
'''
from urllib import request, error

if __name__ == '__main__':
    url = "http://www.sipo.gov.cn/ewew"
    try:
        req = request.Request(url)
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    # HTTPError must be caught before URLError, since it is a subclass
    except error.HTTPError as e:
        print("HttpError: {0}".format(e.reason))
        print("HttpError: {0}".format(e))
    except error.URLError as e:
        print("URLError: {0}".format(e.reason))
        print("URLError: {0}".format(e))
    except Exception as e:
        print(e)
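Because an HTTPError object carries the full server response, you can inspect more than e.reason. A minimal sketch, using httpbin.org as an assumed public test endpoint:

# -*- coding:utf-8 -*-
from urllib import request, error

try:
    # httpbin.org/status/404 is assumed here purely as a test endpoint
    request.urlopen("http://httpbin.org/status/404")
except error.HTTPError as e:
    print(e.code)     # the numeric status code, e.g. 404
    print(e.reason)   # the reason phrase, e.g. NOT FOUND
    print(e.headers)  # the headers the server sent back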
2. Hiding your identity with UserAgent
- UserAgent: the user agent string, UA for short. It is part of the request headers, and the server uses it to identify the visitor.
- Common UA values can simply be copied and pasted, or captured by sniffing a browser's own requests.
- The UA can be set in two ways: through the headers dict, as in the example below, or through the Request object's add_header method (a sketch of that route follows this list).
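For reference, a minimal sketch of the add_header route, assuming the same target URL and UA string as the headers-dict example below:

# -*- coding:utf-8 -*-
from urllib import request

req = request.Request('http://www.baidu.com')
# add_header sets one header at a time on the Request object
req.add_header('User-Agent',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
               'AppleWebKit/537.36 (KHTML, like Gecko) '
               'Chrome/67.0.3396.99 Safari/537.36')
rsp = request.urlopen(req)
print(rsp.read().decode())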
# -*- coding:utf-8 -*-
'''
UA demo:
visit a site
while disguising our UA
'''
from urllib import request, error
import ssl

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        # Disguise the UA through the headers dict
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
        # Skip certificate verification for HTTPS
        context = ssl._create_unverified_context()
        req = request.Request(url, headers=headers)
        # Issue the request as usual
        rsp = request.urlopen(req, context=context)
        html = rsp.read().decode()
        print(html)
    except error.HTTPError as e:
        print("HttpError: {0}".format(e.reason))
        print("HttpError: {0}".format(e))
    except error.URLError as e:
        print("URLError: {0}".format(e.reason))
        print("URLError: {0}".format(e))
    except Exception as e:
        print(e)
3. Cookie & session
- HTTP is a stateless protocol; cookies and sessions are a supplementary mechanism people adopted to make up for that.
- A cookie is a piece of information handed out to the user (i.e. the browser); the session is the matching half kept on the server, used to record information about that user.
- Differences:
    - they are stored in different places
    - cookies are not secure, since they live on the client
    - a session is kept on the server for a limited period of time
    - a single cookie holds no more than 4 KB of data, and many browsers limit a site to at most 20 cookies
- Where sessions are stored:
    - on the server side
    - usually in memory or in a database
- Logging in with a cookie:
    - copy the cookie string directly, then put it into the request headers by hand, as below.
# -*- coding:utf-8 -*-
from urllib import request, error
import ssl

if __name__ == '__main__':
    url = "https://i.zhaopin.com/"
    # A previously captured login cookie, pasted into the headers by hand
    headers = {'Cookie': 'adfbid2=0; ZP_OLD_FLAG=false; dywem=95841923.y; sts_deviceid=164a34417a32a8-0782eddc66b101-163e6953-1296000-164a34417a41b6; LastCity=%E5%8C%97%E4%BA%AC; LastCity%5Fid=530; urlfrom=121113803; urlfrom2=121113803; adfbid=0; adfcid=pzzhubiaoti1; adfcid2=pzzhubiaoti1; sts_sg=1; sts_sid=164add7f4d6227-0bbdd46432f0a3-163e6953-1296000-164add7f4d99f1; zp_src_url=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D3%26rsv_bp%3D0%26rsv_idx%3D1%26tn%3Dbaidu%26wd%3D%25E6%2599%25BA%25E8%2581%2594%25E6%258B%259B%25E8%2581%2598%26rsv_pq%3Dfd66295400008b9b%26rsv_t%3D0a0dcoE6eJdRRtQ3X4yL2yB1Fo9fJwspBfabyoQayEJB%252BTFDp4%252BTuMY9c5I%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D6%26rsv_sug1%3D5%26rsv_sug7%3D101%26rsv_sug2%3D1%26prefixsug%3Dzhil%26rsp%3D1%26inputT%3D2099%26rsv_sug4%3D3034%26rsv_sug%3D1; dywea=95841923.2228216168700154400.1531747423.1531747423.1531924969.2; dywec=95841923; dywez=95841923.1531924969.2.3.dywecsr=other|dyweccn=121113803|dywecmd=cnt|dywectr=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1531747423,1531924969; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1531924969; __xsptplus30=30.2.1531924968.1531924968.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23ddrHxvgyiAUdXt_wBRKZ6tmgv5avrAph%23; __utma=269921210.1850005880.1531747424.1531747583.1531924969.3; __utmc=269921210; __utmz=269921210.1531924969.3.3.utmcsr=other|utmccn=121113803|utmcmd=cnt|utmctr=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98; _jzqa=1.2794138475449971700.1531747424.1531747424.1531924969.2; _jzqc=1; _jzqy=1.1531747424.1531924969.2.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98%E5%AE%98%E6%96%B9%E7%BD%91.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98; _jzqckmp=1; lastchannelurl=https%3A//passport.zhaopin.com/login; qrcodekey=9e799d8473234c8abd9330bc378dbfeb; firstchannelurl=https%3A//passport.zhaopin.com/login%3Fy7bRbP%3DdpmxkhDS7MDS7MDSScnq5F8g4_qtZ_R_vUYgLXg8.Ml; __utmt=1; _jzqb=1.3.10.1531924969.1; JsNewlogin=1875023141; JSloginnamecookie=18232533068; JSShowname=%E7%8E%8B%E6%B4%AA%E6%9D%B0; at=6245887dc42b49f797c1e07ccfdc31c1; Token=6245887dc42b49f797c1e07ccfdc31c1; rt=6b3947f9686944baa5419419a1485913; JSsUserInfo=36672168546b5d754377507147654771526350655969586b4e713b653f77587740655b6757685a6b5975447756714465477153635f6553693e6b3b714a65fa044f1a010258672068246b56751b7710711b650e7112630f6502692b6b4471416542774b7712650c670868526b38752577587146654c712463336555695d6b5871446551775777426559675168526b2a753d77587147654c71306326655569216b3871456544775577416554675268586b587544775e7122652371586356655369386b3c714a6541775e77246533672b68546b5a754977577144654571506354655b695b6b43714f654a771; uiioit=3d753d6a44640f38596d5b620635546842795d7955390b6b566e203671645575496a42649; ZP-ENV-FLAG=gray; __utmb=269921210.4.10.1531924969; dyweb=95841923.4.10.1531924969; Hm_lvt_363368edd8b243d3ad4afde198719c4a=1531747506,1531926255; Hm_lpvt_363368edd8b243d3ad4afde198719c4a=1531926255; ZL_REPORT_GLOBAL={%22//i%22:{%22actionIdFromI%22:%22c11070c9-e8a7-42bb-b1e1-f10b433baa17-i%22}}; sts_evtseq=8; stayTimeCookie=1531926255173; referrerUrl=https%3A//i.zhaopin.com/'}
    req = request.Request(url, headers=headers)
    context = ssl._create_unverified_context()
    rsp = request.urlopen(req, context=context)
    html = rsp.read().decode()
    print(html)
- The http package contains cookie-handling modules that let us use cookies automatically:
- CookieJar
    - manages and stores cookies, and attaches them to outgoing HTTP requests
    - cookies live in memory; once the CookieJar instance is garbage-collected, they are gone
- FileCookieJar(filename, delayload=None, policy=None):
    - manages cookies through a file
    - filename is the file the cookies are saved in
- MozillaCookieJar(filename, delayload=None, policy=None):
    - creates a FileCookieJar instance compatible with the Mozilla browser's cookies.txt format
- LWPCookieJar(filename, delayload=None, policy=None):
    - creates a FileCookieJar instance compatible with the libwww-perl Set-Cookie3 format
- Their relationship: CookieJar -> FileCookieJar -> MozillaCookieJar & LWPCookieJar (a save/load sketch follows the CSDN example below)
- Using CookieJar to access CSDN:
    - flow for logging in with cookies automatically:
        - open the login page and log in with a username and password
        - automatically extract the cookie sent back in the response
        - use the extracted cookie to access a private page
# -*- coding:utf-8 -*-
from urllib import request, error, parse
from http import cookiejar
import ssl

# Create a CookieJar instance
cookie = cookiejar.CookieJar()
# Build the cookie handler from it
cookie_handler = request.HTTPCookieProcessor(cookie)
# Create the HTTP handler
http_handler = request.HTTPHandler()
# Create the HTTPS handler
https_handler = request.HTTPSHandler()
# Build an opener that carries the cookie jar
opener = request.build_opener(http_handler, https_handler, cookie_handler)
ssl._create_default_https_context = ssl._create_unverified_context


def login():
    """
    Performs the initial login.
    Needs a username and password, to obtain the login cookie.
    :return:
    """
    print("----")
    url = "https://passport.csdn.net/account/verify"
    # Simulated form parameters
    data = {
        'username': '18232533068',
        'password': '125846Whj1993'
    }
    # Encode the form data
    data = parse.urlencode(data).encode()
    # Build the request object
    req = request.Request(url=url, data=data)
    # Send the request through our opener; the returned cookie
    # is captured into the CookieJar automatically
    rsp = opener.open(req)
    html = rsp.read().decode()
    with open("login.html", 'w') as f:
        f.write(html)


def getHomePage():
    url = "https://my.csdn.net/"
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    req = request.Request(url=url, headers=headers)
    # If login() has already run, the opener carries the session cookie
    rsp = opener.open(req)
    html = rsp.read().decode()
    print(html)
    with open("rsp.html", 'w') as f:
        f.write(html)


if __name__ == '__main__':
    login()
    getHomePage()
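The example above keeps its cookies in memory only. As a hedged sketch of the file-backed jars listed earlier, persisting cookies with MozillaCookieJar could look like this (the filename cookie.txt and the target URL are placeholders of mine):

# -*- coding:utf-8 -*-
from urllib import request
from http import cookiejar

filename = "cookie.txt"  # placeholder filename
cookie = cookiejar.MozillaCookieJar(filename)
opener = request.build_opener(request.HTTPCookieProcessor(cookie))
rsp = opener.open("http://www.baidu.com")
# ignore_discard keeps session cookies; ignore_expires keeps expired ones
cookie.save(ignore_discard=True, ignore_expires=True)
# Later, a fresh jar can restore them from the same file:
# cookie.load(filename, ignore_discard=True, ignore_expires=True)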
4. ProxyHandler (proxy servers)
- Using proxy IPs is a common crawling technique.
- Where to get proxy server addresses:
    - www.xicidaili.com
    - www.goubanjia.com
- A proxy hides who is really making the request, but a single proxy also must not hit one fixed site too frequently, so you need a large pool of proxies (see the rotation sketch after the example).
- Basic steps for using a proxy:
    - set the proxy address
    - create a ProxyHandler
    - build an Opener
    - install the Opener
- Example v08
# -*- coding:utf-8 -*-
'''
Visit Baidu through a proxy
'''
from urllib import request, error
import ssl

if __name__ == '__main__':
    url = "http://www.baidu.com/"
    # Set the proxy address
    proxy = {'http': '120.194.18.90:81'}
    # Create the ProxyHandler
    proxy_handler = request.ProxyHandler(proxy)
    # Build the Opener
    opener = request.build_opener(proxy_handler)
    # Install the Opener so urlopen uses it globally
    request.install_opener(opener)
    context = ssl._create_unverified_context()
    try:
        rsp = request.urlopen(url, context=context)
        html = rsp.read().decode()
        print(html)
    # HTTPError is a subclass of URLError, so it must be caught first
    except error.HTTPError as e:
        print("HTTPError {0}".format(e.reason))
        print("HTTPError {0}".format(e))
    except error.URLError as e:
        print("URLError {0}".format(e.reason))
        print("URLError {0}".format(e))
    except Exception as e:
        print(e)
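Since a single proxy must not hammer one site, here is a hedged sketch of rotating through a proxy pool (apart from the address used in the example above, the pool entries are made-up placeholders, not verified live proxies):

# -*- coding:utf-8 -*-
import random
from urllib import request, error

# One entry from the example above plus made-up placeholders
proxy_pool = [
    {'http': '120.194.18.90:81'},
    {'http': '111.0.0.1:8080'},   # placeholder
    {'http': '222.0.0.2:3128'},   # placeholder
]

def open_with_random_proxy(url):
    # Pick a proxy at random so no single proxy hits the site too often
    proxy = random.choice(proxy_pool)
    opener = request.build_opener(request.ProxyHandler(proxy))
    try:
        return opener.open(url, timeout=10).read().decode()
    except (error.URLError, OSError) as e:
        print("proxy {0} failed: {1}".format(proxy, e))
        return None

if __name__ == '__main__':
    print(open_with_random_proxy("http://www.baidu.com/"))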