Bootstrap

Playwright——获取页面中所有资源链接

# 导入asyncio库,用于处理异步编程
import asyncio
# 从playwright的异步API中导入async_playwright,用于控制浏览器
from playwright.async_api import async_playwright

# Async entry point of the script.
async def main():
    """Load the target page in Chromium and print every resource URL it references.

    The page is opened with Playwright, a JavaScript snippet is evaluated in
    the page to collect the ``src``/``href`` of matching elements, and each
    resulting URL is printed on its own line.
    """
    # The context manager shuts the Playwright driver down on exit.
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto('https://qiye.58pic.com/newpic/32460413.html')

            # Runs inside the page: gather elements that carry a src attribute
            # or an href pointing at a stylesheet/script/JSON/HLS playlist.
            all_resources = await page.evaluate('''() => {
                const resources = Array.from(document.querySelectorAll(
                    '[src], link[rel="stylesheet"], img[src], script[src], ' +
                    '[href$=".css"], [href$=".js"], [href$=".json"], [href$=".m3u8"]'
                ));
                // Elements without a usable src/href property yield undefined,
                // an empty string or a non-string object; keep real URLs only.
                return resources
                    .map(resource => resource.src || resource.href)
                    .filter(url => typeof url === 'string' && url !== '');
            }''')

            for resource in all_resources:
                print("资源链接: ", resource)
        finally:
            # Close the browser even when navigation or evaluation fails.
            await browser.close()

# Run the async entry point to completion.
asyncio.run(main())