1. urlopen
urlopen is used to request data.
Here we open http://www.baidu.com/ from its URL.
1.1 It returns a response object
1.2 response.read()
1.3 bytes.decode("utf-8")
url_open_code.py
import urllib.request

def load_data():
    url = "http://www.baidu.com/"
    # GET request over HTTP
    # response: the HTTP response object
    response = urllib.request.urlopen(url)
    print(response)
    # Read the body; the content is of type bytes
    data = response.read()
    print(data)
    # Convert the fetched bytes into a string
    str_data = data.decode("utf-8")
    print(str_data)
    # Write the data to a file
    with open("baidu.html", "w", encoding="utf-8") as f:
        f.write(str_data)
    # Convert a string into bytes
    str_name = "baidu"
    bytes_name = str_name.encode("utf-8")
    print(bytes_name)
    # A crawl returns either str or bytes:
    # if you fetched bytes but need to write a string, use decode("utf-8");
    # if you fetched str but need to write bytes, use encode("utf-8").

load_data()
2. GET requests with parameters
2.1 Chinese characters raise an error
ASCII has no Chinese characters, so Chinese characters in a URL must be percent-encoded.
=====================================
urllib.parse.quote
safe=string.printable
2.2 Passing parameters as a dict
urllib.parse.urlencode()
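A quick sketch of what these two helpers return, run in a Python 3 shell (the sample strings and the expected output in the comments are my own illustration, not output copied from the scripts below):

import urllib.parse
import string

# quote() percent-encodes the non-ASCII characters; safe=string.printable
# leaves printable ASCII such as : / ? = untouched
print(urllib.parse.quote("http://www.baidu.com/s?wd=中文", safe=string.printable))
# -> http://www.baidu.com/s?wd=%E4%B8%AD%E6%96%87

# urlencode() turns a dict into key=value pairs joined by &, encoding each value
print(urllib.parse.urlencode({"wd": "中文", "key": "xiao"}))
# -> wd=%E4%B8%AD%E6%96%87&key=xiao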
=====================================
get-params.py
import urllib.request
import urllib.parse
import string

def get_method_params():
    url = "http://www.baidu.com/s?wd="
    # Concatenate the Chinese keyword onto the URL;
    # the target is https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
    name = "美女"
    final_url = url + name
    print(final_url)
    # The URL contains Chinese characters; ASCII has none, so the URL must be encoded
    encode_new_url = urllib.parse.quote(final_url, safe=string.printable)
    print(encode_new_url)
    # Send the network request
    response = urllib.request.urlopen(encode_new_url)
    print(response)
    # Read the body
    data = response.read().decode()
    print(data)
    # Save it locally
    with open("02-encode.html", "w", encoding="utf-8") as f:
        f.write(data)
    # Passing the un-encoded URL to urlopen raises:
    # UnicodeEncodeError: 'ascii' codec can't encode
    # characters in position 10-11: ordinal not in range(128)
    # because urlopen only accepts ASCII (0-127) in the URL; Chinese characters are rejected

get_method_params()
=====================================
get-params.py
import urllib.request
import urllib.parse
import string

def get_params():
    url = "http://www.baidu.com/s?"
    # url + dict of parameters
    params = {
        "wd": "中文",
        "key": "xiao",
        "value": "ming"
    }
    # urlencode() percent-encodes the values and joins the pairs as key=value&...
    str_params = urllib.parse.urlencode(params)
    final_url = url + str_params
    # Encode any remaining non-ASCII characters so the URL is machine-readable
    end_url = urllib.parse.quote(final_url, safe=string.printable)
    response = urllib.request.urlopen(end_url)
    # Unpack the response
    data = response.read().decode("utf-8")
    print(data)

get_params()
3. Request headers
3.1 A first look at request headers
request_header.py
import urllib.request

def load_baidu():
    url = "https://www.baidu.com"
    # Create the request object
    request = urllib.request.Request(url)
    # Request the data
    response = urllib.request.urlopen(request)
    print(response)
    # Decode the body
    data = response.read().decode("utf-8")
    # Get the full URL
    final_url = request.get_full_url()
    print(final_url)
    # Response headers
    # print(response.headers)
    # Get the request headers
    request_headers = request.headers
    print(request_headers)
    # Create 02header.html and write the page into it
    with open("02header.html", "w", encoding="utf-8") as f:
        f.write(data)

load_baidu()
# The saved https://www.baidu.com page can be opened from the Run output
3.2 A second look at request headers
request_header_two.py
import urllib.request

def load_baidu():
    url = "http://www.baidu.com"
    # Request header information
    header = {
        # Browser version
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"
    }
    # First way to create the request object: pass the headers directly
    # request = urllib.request.Request(url, headers=header)
    # Second way: create the request object, then add the header dynamically
    request = urllib.request.Request(url)
    request.add_header("User-Agent",
                       "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362")
    # Request the data
    response = urllib.request.urlopen(request)
    print(response)
    data = response.read().decode("utf-8")
    # Get the full URL
    final_url = request.get_full_url()
    print(final_url)
    # Print the response headers
    # print(response.headers)
    # First way to read the request headers (prints them all)
    # request_headers = request.headers
    # print(request_headers)
    # Second way: read a single header; note that inside the quotes
    # only the first letter is capitalized, the rest are lowercase
    request_headers = request.get_header("User-agent")
    print(request_headers)

load_baidu()
3.3 Taking request headers further
Pick a random User-Agent from user_agent_list.
random_user_agent.py
import urllib.request
import random

def load_baidu():
    url = "http://www.baidu.com"
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50"
    ]
    # Every request uses a different browser identity
    random_user_agent = random.choice(user_agent_list)
    request = urllib.request.Request(url)
    # Add the chosen User-Agent to the request headers
    request.add_header("User-Agent", random_user_agent)
    # Request the data
    response = urllib.request.urlopen(request)
    # Print the request header that was set
    print(request.get_header("User-agent"))

load_baidu()
4. User-Agent
Purpose: when batch-searching Baidu, a User-Agent makes the request look like it came from a real browser (without one, anti-crawling measures will block you).
Where to get one: inspect element in your browser, or search Baidu for a list of User-Agent strings.
Notes:
Add the header with request.add_header() (adds header data dynamically).
Read the response headers with response.headers.
Create the request with urllib.request.Request(url).
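A minimal sketch pulling these notes together (the User-Agent value here is just a sample string):

import urllib.request

url = "http://www.baidu.com"
# Create the request object
request = urllib.request.Request(url)
# Dynamically add the User-Agent header
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
response = urllib.request.urlopen(request)
# Request header that was sent, then the response headers
print(request.get_header("User-agent"))
print(response.headers)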
5. IP proxies
Free IPs: short-lived and error-prone.
Paid IPs: cost money, and some of them still expire and stop working.
Types of proxy IPs:
Transparent: the server knows our real IP.
Anonymous: the server does not know our real IP, but it knows we are using a proxy.
Elite (high anonymity): the server knows neither our real IP nor that we are using a proxy.
handler
The built-in urlopen does not support adding a proxy, so we create a handler.
Steps:
Create a ProxyHandler for the proxy.
Build an opener from the ProxyHandler with build_opener().
Call opener.open(url) to request the data.
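The three steps as a minimal sketch (the proxy address below is a placeholder, not a working proxy):

import urllib.request

# 1. Create the proxy handler
proxy_handler = urllib.request.ProxyHandler({"http": "127.0.0.1:8888"})  # placeholder proxy
# 2. Build an opener from the handler
opener = urllib.request.build_opener(proxy_handler)
# 3. Request data through the opener
response = opener.open("http://www.baidu.com", timeout=5)
print(response.read().decode("utf-8"))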
5.1 Handlers and a custom opener
handler_opener.py
import urllib.request

def handler_opener():
    # The built-in urlopen cannot add a proxy, so we build that machinery ourselves
    # https: SSL secure sockets layer, with a certificate from a third-party CA
    # http uses port 80, https uses port 443
    # urlopen itself works by combining a handler with an opener
    url = "https://blog.csdn.net/qq_42893334"
    # Create a handler
    handler = urllib.request.HTTPHandler()
    # Create an opener
    opener = urllib.request.build_opener(handler)
    # Use the opener's open method to request data
    response = opener.open(url)
    # data = response.read()
    data = response.read().decode("utf-8")
    print(response)
    print(data)

handler_opener()
5.2 Adding a proxy
proxy-handler.py
import urllib.request

def create_proxy_handler():
    url = "https://blog.csdn.net/qq_42893334"
    # Add a proxy; note this dict only sets a proxy for http:// URLs,
    # add an "https" key to proxy https requests as well
    proxy = {
        # Format for a free proxy
        "http": "45.115.63.78:55443"
        # "http": "120.77.249.46:8080"
        # Format for a paid proxy (username:password@host)
        # "http": "xiaoming":123@115.
    }
    # Proxy handler
    proxy_handler = urllib.request.ProxyHandler(proxy)
    # Create an opener
    opener = urllib.request.build_opener(proxy_handler)
    # Send the request through the proxy IP
    response = opener.open(url)
    print(response)
    data = response.read().decode("utf-8")
    print(data)

create_proxy_handler()
5.3 Rotating proxies
random-user-proxy.py
import urllib.request

def proxy_user():
    proxy_list = [
        # {"https": ""},
        {"https": "106.75.226.36:808"},
        {"https": "61.135.217.7:80"},
        {"https": "125.70.13.77:8080"},
        {"https": "118.190.95.35:9001"}
    ]
    for proxy in proxy_list:
        print(proxy)
        # Build a handler from the proxy picked in this iteration
        proxy_handler = urllib.request.ProxyHandler(proxy)
        # Create an opener
        opener = urllib.request.build_opener(proxy_handler)
        try:
            data = opener.open("http://www.baidu.com", timeout=1)
            print("OK")
        except Exception as e:
            print(e)

proxy_user()