Bootstrap

爬虫的新模块pyppeteer的使用之 禁止加载图片 截获response和request

截获response和request

await page.setRequestInterception(True)
page.on('request', intercept_request)
page.on('response', intercept_response)

intercept_requestintercept_response相当于是注册的两个回调函数,在浏览器发出请求和获取到请求之前指向这两个函数。

比如可以这样禁止获取图片、多媒体资源和发起 websocket 请求:

async def intercept_request(req):
    """请求过滤"""
    if req.resourceType in ['image', 'media', 'eventsource', 'websocket']:
        await req.abort()
    else:
        await req.continue_()

然后每次获取到请求之后将内容打印出来(这里只打印了fetchxhr类型response 的内容):

async def intercept_response(res):
    resourceType = res.request.resourceType
    if resourceType in ['xhr', 'fetch']:
        resp = await res.text()
        print(resp)

 

from scrapy import signals
import pyppeteer
import asyncio
import os
import time
import json
import tkinter
from scrapy.http import HtmlResponse
from Aliexpress.ConfigDB import RedisDB,RedisPool
import logging
pyppeteer_level = logging.WARNING
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)
logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
pyppeteer_logger = logging.getLogger('pyppeteer')
pyppeteer_logger.setLevel(logging.WARNING)
# redisconn=RedisDB(db=0)
redisconn=RedisPool(db=0)
pyppeteer.DEBUG = False

def _patch_pyppeteer():
    from typing import Any
    from pyppeteer import connection, launcher
    import websockets.client

    class PatchedConnection(connection.Connection):  # type: ignore
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            super().__init__(*args, **kwargs)
            # the _ws argument is not yet connected, can simply be replaced with another
            # with better defaults.
            self._ws = websockets.client.connect(
                self._url,
                loop=self._loop,
                # the followi
;