实时抓取抖音直播的来客和评论

hello，大家好，我是一名测试开发工程师，至今已在自动化测试领域深耕9个年头，现已将本人实战多年的多终端自动化测试框架【wyTest】开源啦，在接下来的一个月里，我将坚持通过直播方式指导大家体验wyTest，请大家关注我。

声明：该脚本仅用于个人练习手机端的自动化测试技术，并无其他用途，也建议大家不要挪作他用！！！

今日我突发奇想，决定将抖音直播频道作为我练习编写自动化测试的舞台。我构思的方案是这样的：每当有新的观众进入直播间，便能实时捕捉并立刻播送欢迎的话语；同时，若观众在评论区留下数字，系统便能即刻识别并读出那些数字。

起初，我选择了第一方案：通过浏览器访问抖音网站，进入我的直播频道，然后尝试实时捕捉观众的进入和评论数据，并按照预设的规则进行编码。由于网页端的自动化脚本编写相对简单，这成了我的首选方案。

以下是代码部分（元素xpath已被篡改，所以代码仅供学习参考用）：

######################
# 涉及 语音pyttsx3、浏览器操作selenium、多线程threading
######################
import re
import pyttsx3
import threading
from wyTest_testAuto.library.webFun import webFun
from selenium.webdriver.remote.webdriver import By
from selenium.webdriver.common.keys import Keys

# pyttsx3 converts text to speech; a single engine is created here at module
# level and used by say_txt() below.
engine = pyttsx3.init()

# Speech rate in words per minute (values usually fall between 80 and 500).
engine.setProperty('rate', 250)

def say_txt(txt):
    """Speak ``txt`` through the module-level ``engine``.

    The text is queued with ``engine.say`` and then ``engine.runAndWait`` is
    invoked; once that call returns a marker line is printed to stdout.

    Args:
        txt: The text to be spoken.
    """
    queue_utterance = engine.say
    queue_utterance(txt)
    run_queue = engine.runAndWait
    run_queue()
    print(">>>>>>>>>>>>")

# Browser wrapper from the wyTest framework; the functions below reach the
# underlying driver through webObj.myPage.
# NOTE(review): "weiyv" is presumably a profile/session name — confirm in webFun.
webObj = webFun("weiyv")
# Opening the live-room page is currently disabled.
# webObj.page_open("https://live.douyin.com/32372570424")

# Hand-off lists between the two threads: find_shuju() appends, deal_txt() pops.
fangkes = []  # bottom-banner texts (visitor entry notices)
pingluns = []  # chat item texts (comments)

def find_shuju():
    """Poll the live-room page forever and queue newly seen text.

    Two sources are checked on every pass:

    * the bottom banner message: its stripped text is appended to ``fangkes``
      whenever it differs from the previously seen banner text;
    * the ``si``-th chat item (1-based XPath index): when it exists, its
      non-empty stripped text is appended to ``pingluns`` and the index
      advances to the next item.

    Lookup failures are treated as "nothing there yet". The function never
    returns and is meant to run in its own thread.
    """
    import time  # local import: this script has no top-level ``import time``

    banner_xpath = "//*[@class='webcast-xxx___bottom-message']"
    # One template for every index. The original built the first XPath with
    # 'webcast-xxx___enter-done' but all later ones with
    # 'webcast-chatroom___enter-done', so the selector silently changed after
    # the first comment had been read.
    item_xpath = "(//*[@class='webcast-xxx___item webcast-xxx___enter-done'])[{}]"
    idle_delay = 0.1  # seconds to back off when no new chat item is present

    si = 1
    last_txt = ""
    while True:
        try:
            txt = webObj.myPage.find_element(By.XPATH, banner_xpath).text.strip()
        except Exception:
            # Best effort: the banner is absent or vanished while being read.
            pass
        else:
            if txt != last_txt:
                last_txt = txt
                fangkes.append(last_txt)

        try:
            txt = webObj.myPage.find_element(
                By.XPATH, item_xpath.format(si)
            ).text.strip()
        except Exception:
            # The si-th item does not exist yet; wait briefly instead of
            # hammering the browser in a tight loop.
            time.sleep(idle_delay)
        else:
            if txt:
                pingluns.append(txt)
            si += 1


def deal_txt():
    """Consume queued texts forever: greet visitors and read digits aloud.

    Entries in ``fangkes`` are handled before entries in ``pingluns``. Each
    text is printed, then:

    * if it ends with " 来了" it is treated as an entry notice: a welcome is
      spoken via ``say_txt`` and a message is typed into the chat box and
      submitted with ENTER;
    * otherwise every run of digits in the text is concatenated and spoken.

    The function never returns and is meant to run in its own thread.
    """
    import time  # local import: this script has no top-level ``import time``

    digits_re = re.compile(r"\d+")  # compiled once instead of per message
    idle_delay = 0.05  # seconds to sleep when both queues are empty

    while True:
        if fangkes:
            txt = fangkes.pop(0)
        elif pingluns:
            txt = pingluns.pop(0)
        else:
            # Sleep rather than spin so the polling thread is not starved.
            time.sleep(idle_delay)
            continue

        print(txt)
        if txt.endswith(" 来了"):
            say_txt(f"热烈欢迎 {txt[:-3]}")
            box = webObj.myPage.find_element(
                By.XPATH, "//*[@id='xxx-textarea']"
            )
            # WebElement.send_keys() returns None, so the original chained
            # ``.send_keys(...).send_keys(Keys.ENTER)`` raised AttributeError.
            # NOTE(review): the [:-8] slice is kept from the original; it cuts
            # five characters more than the " 来了" suffix — confirm intent.
            box.send_keys(txt[:-8])
            box.send_keys(Keys.ENTER)
        else:
            xlist = digits_re.findall(txt)
            if xlist:
                t = "".join(xlist)
                say_txt(f"数字跟读 {t}")

# Run the page poller and the announcer concurrently; both targets loop
# forever, so neither thread is joined.
thread1 = threading.Thread(target=find_shuju)  # producer: fills fangkes / pingluns
thread2 = threading.Thread(target=deal_txt)  # consumer: speaks and replies
thread1.start()
thread2.start()

然而，实践中的挑战远超预期。随着时间的推移，浏览器页面逐渐变得卡顿，数据的抓取也变得不稳定，手动点击也无法得到响应。我检查了进程占用情况，惊讶地发现该浏览器的内存占用竟高达5G。我猜测这可能是浏览器缓存数据过多导致的，于是这一方案最终以失败告终。

紧接着，我转向了第二方案：利用手机打开抖音APP，并进入直播频道。以下是代码部分（元素xpath已被篡改，所以代码仅供学习参考用）：

############################
# 涉及：语音pyttsx3、移动端操作appium、多线程threading、正则表达式re
############################

import re
import time
import pyttsx3
import os
import threading
from selenium.webdriver.remote.webdriver import By
from wyTest_testAuto.library.base.appium.appiumConnect import AppiumConnect


# Appium capabilities for the Android test phone.
# NOTE(review): the per-key remarks are my reading of the Appium/UiAutomator2
# capability names — confirm against the Appium version in use.
desired_caps = {
    'newCommandTimeout': 600,  # presumably seconds of inactivity before the session ends
    'noReset': True,  # presumably keeps app state between sessions
    'noSign': True,
    'unicodeKeyboard': True,
    'resetKeyboard': True,
    'autoGrantPermissions': True,
    'automationName': 'uiautomator2',
    'platformName': 'Android',
    'platformVersion': '12',
    'deviceName': 'heisha',
    'udid': 'f0b8d534',  # identifier of the target device
    'skipServerInstallation': True,
    'skipDeviceInitialization': True
}
# Session wrapper from the wyTest framework; find_shuju() queries it for elements.
myApp = AppiumConnect(desired_capabilities=desired_caps)

# Hand-off list between the two threads: find_shuju() appends, deal_txt() pops.
say_list = []


def find_shuju():
    """Poll the on-screen text elements forever and queue the unseen ones.

    On each pass the stripped text of every matching element is collected in
    order. Starting from the first text that is not in the recent-history
    list, that text and everything after it in the same pass is appended to
    ``say_list`` and to the history, which is capped at the last 30 entries.

    Limitation: a repeated message whose text is still in the history is not
    detected as new unless an unseen text precedes it in the same pass.

    The function never returns and is meant to run in its own thread.
    """
    txtList_last = []
    while True:
        ccs = myApp.find_elements(By.XPATH, "//*[@resource-id='com.ss.android.ugc.xxxx:id/text']")
        txtList = []
        for cc in ccs:
            try:
                txtList.append(cc.text.strip())
            except Exception:
                # Reading an element failed (was a bare ``except:``, which
                # would also trap SystemExit/KeyboardInterrupt); keep what
                # has been read so far and stop this pass.
                break

        flag = False  # becomes True at the first unseen text of this pass
        for txt in txtList:
            if flag or txt not in txtList_last:
                flag = True
                txtList_last.append(txt)
                say_list.append(txt)
        if len(txtList_last) > 30:
            txtList_last = txtList_last[-30:]


def deal_txt():
    """Consume ``say_list`` forever: greet visitors and read digits aloud.

    A pyttsx3 engine is created inside this function, so it is initialised
    and used by whichever thread runs the function. For each queued text:

    * if it ends with " 来了" it is treated as an entry notice: a welcome
      line is printed and spoken, followed by a fixed follow-me slogan;
    * otherwise every run of digits in the text is concatenated, printed and
      spoken.

    The function never returns and is meant to run in its own thread.
    """
    engine = pyttsx3.init()
    # Speech rate in words per minute (values usually fall between 80 and 500).
    engine.setProperty('rate', 250)

    digits_re = re.compile(r"\d+")  # compiled once instead of per message
    separator = "\n>>>>>>>>>>>>>>>>>>>>>>>>>>"
    idle_delay = 0.05  # seconds to sleep when the queue is empty

    def say_txt(txt):
        """Queue ``txt`` on the engine and run it."""
        engine.say(txt)
        engine.runAndWait()

    while True:
        if not say_list:
            # Sleep rather than spin so the polling thread is not starved.
            time.sleep(idle_delay)
            continue

        txt = say_list.pop(0)
        if txt.endswith(" 来了"):
            welcome = f"热烈欢迎 {txt[:-3]}"
            print(separator)
            print(welcome)
            say_txt(welcome)
            say_txt("点点关注，你我相连，抖音之路，不再孤单。")
        else:
            xlist = digits_re.findall(txt)
            if xlist:
                t = "".join(xlist)
                print(separator)
                print(t)
                say_txt(f"数字跟读 {t}")


# Run the screen poller and the announcer concurrently; both targets loop
# forever, so neither thread is joined.
thread1 = threading.Thread(target=find_shuju)  # producer: fills say_list
thread2 = threading.Thread(target=deal_txt)  # consumer: speaks queued texts
thread1.start()
thread2.start()

尽管这一方案在实施过程中充满了曲折——我需要重新熟悉并部署Appium的环境，而且我的测试手机也有些陈旧——但我最终成功地实现了所需的功能。

不过，这整个过程也让我的手机异常发热，我只好借助电风扇为它降温。在成功实现第二方案后，我深感其中的不易，但也收获了宝贵的经验。尽管过程中遇到了许多困难，比如环境配置的繁琐、手机性能的限制，甚至是直播间热度带来的手机过热问题，但这些挑战都让我更加深刻地理解了自动化测试的实际操作和应用。

实时抓取抖音直播的来客和评论

悦读