仅作练习,侵权联系删除
取cookie
使用自动化工具(DrissionPage)打开浏览器,手动登录账号后获取cookie;只有登录了账号,才会出现下面这个界面
没有账号的可以使用代码中自带的cookie(在control.py文件中),但使用这个cookie爬取时会遇到反爬滑块验证码,无法自动跳过,只能等待一段时间后再试,等待时间可能很长,也可能很短
取cookie的代码
from DrissionPage import ChromiumPage
def get_ck(page=None, url=None, wait_seconds=10):
    """Load the AliExpress search page in a browser and return its cookies.

    Args:
        page: Optional page object exposing ``get(url)`` and ``cookies()``
            (the latter yielding mappings with ``'name'`` and ``'value'``
            keys). When omitted, a new ``ChromiumPage`` is created.
        url: Page to open. Defaults to the search URL used by the scraper.
        wait_seconds: Seconds to sleep after navigation before the cookies
            are read (presumably to let the page settle / the user log in).

    Returns:
        dict mapping cookie name to cookie value.
    """
    # Local import keeps the snippet self-contained: it is published with
    # only the DrissionPage import above it.
    import time

    if url is None:
        url = "https://www.aliexpress.us/w/wholesale-men's-bottoms.html?g=y&SearchText=men%27s+bottoms"
    if page is None:
        from DrissionPage import ChromiumPage
        page = ChromiumPage()

    page.get(url)
    time.sleep(wait_seconds)

    # Build the mapping directly. The previous string-join -> json.dumps ->
    # regex round-trip dropped cookies whose value was empty and mangled
    # values containing quotes, backslashes or non-ASCII characters.
    cookies = {cookie['name']: cookie['value'] for cookie in page.cookies()}
    print('cookies', cookies)
    return cookies
代码
文件位置:三个文件放在同一目录下,目录名随便取;文件名不要修改(代码中通过 from ui import ...、from control import ... 按文件名互相导入)
control.py文件
import json
import random
import re
import time
import requests
from ui import Win
from DrissionPage import WebPage
# 导入 ChromiumOptions
from DrissionPage import ChromiumPage, ChromiumOptions
class Controller:
    """Connects the Tk window to the AliExpress search scraper.

    The UI calls ``sousuo`` to turn the keyword list into scrape jobs,
    ``spider`` runs one job (one keyword, pages 1..page) and pushes every
    parsed item back into the UI table, and ``qingkong`` / ``tianjia``
    forward button events to the window.
    """

    # String annotation: the name only has to resolve for type checkers.
    ui: "Win"
    attempt_count: int

    _SEARCH_URL = 'https://www.aliexpress.us/fn/search-pc/index'
    _LOGIN_URL = "https://www.aliexpress.us/w/wholesale-men's-bottoms.html?g=y&SearchText=men%27s+bottoms"
    _MAX_RETRIES = 4

    # NOTE(review): hard-coded session cookies captured from a browser.
    # They are the initial cookies for every job and are replaced by
    # get_ck() as soon as a response cannot be parsed.
    _DEFAULT_COOKIES = {
        'x_router_us_f': 'x_alimid=4461714644',
        'aep_common_f': 'zcGjNzurUgUi6yYlXR+iec/vnzNtLs2USBbQKY5+GsBQjbleT4aaDA==',
        'sgcookie': 'E100JOobqgkZzOW8GXWNSSW5jkJy3iVicMmlZrz9AbD/LQqBYemNS2jkMDkxQQA350fsZkFs43srs1RABga5/TomSikGZg8DJ4CMLq3HO0ytm4Y=',
        '_ga_save': 'yes',
        'xman_f': '07OW3BVrp1ynLf27WfaJGW/AJbwIB0AvOY1gRBG1fmvTEav1rLYIeINciLdvHMJ9EjL+CbZ8bFabo52hXFG8ykzYjt8S7gYcuHgvqJyWT1wOMuSAD9/PXWiPg8iap7RuyLVa33hQNkndGAtxhtGz3gvCirQyNoMn2FRhf3+lBvzb7KiG7/NPty1gki/B+BcxuBak9/HoMq6jUgcGI3XxxX2b6mY5kO6SN6ztbFJ3txwSG5CWzPzyF46AXsLWTpun+U+sbfMEH1eAYfnNZWXW0NvPKkJS/zJBJtfsREud5iEBEbZl3GakXnsQzVZmhQQDQ5iIefNXK3fIi3yrZuknZdllAtd+o/YU7IlsrWjGteDBYjbubOBtAesuJDI6TJx+BdcWlleJARfdYTKkbxUSvDNLolZoPJCD',
        'intl_locale': 'en_US',
        'xman_us_f': 'zero_order=y&x_locale=en_US&x_l=1&x_user=US|%E8%B6%85%E7%BE%A4|%E5%B0%81|ifm|4461714644&x_lid=us2852395651hyoae&x_c_chg=1&x_c_synced=1&acs_rt=ab8ebed08c364f1489437cb9e388c9cc',
        'lwrid': 'AgGQTugqkVRERwJSKOU1X39uI%2BYr',
        'join_status': '',
        'cna': 'pYcBH6Vjk0QCAXs2Jk+PVL1k',
        '_m_h5_tk': '0272d0a0e09655e625834ef0b14e477b_1719313187897',
        '_m_h5_tk_enc': 'eed5f98d9e82c9051b527ab47afff52b',
        'xlly_s': '1',
        # "&region" had been mangled into the "®ion" character sequence
        # (HTML-entity decoding of "&reg"); restored here.
        'aep_usuc_f': 'site=usa&c_tp=USD&x_alimid=4461714644&isb=y&region=CN&b_locale=en_US&ae_u_p_s=2',
        '_gcl_au': '1.1.539085097.1719310761',
        '__rtbh.uid': '%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%224461714644%22%7D',
        '__rtbh.lid': '%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%22KwFEhAA8v4C0dV4ALCAj%22%7D',
        '_gid': 'GA1.2.975831037.1719310763',
        'AB_DATA_TRACK': '472051_617385',
        'AB_ALG': '',
        'AB_STG': 'st_StrategyExp_1694492533501%23stg_687',
        'acs_usuc_t': 'x_csrf=1dkarwk2f6idi&acs_rt=bf1a0eea9160434ea34f03335f9fe3b6',
        'xman_t': 'mbXMVSuL3dwW2iFYa9jRIEH9u8a/n/rcAnR4cZwZA0QvqEAJanb2rHQWyR2Roc9H',
        '_ga': 'GA1.1.953021961.1719309733',
        'cto_bundle': 'fGeoxF81QVhUeSUyQjhRNVVYd1NLalpMMjFvQ29iSGhLRmNYNWVoJTJCTm5sSDVjc2tySFVVRyUyQnZ2U2l5MDBOTXVWVnRvRU5DTmFVNXhsRnpWR2lvaXQlMkZMM0dzenNHWUc2THlwRE9XVm93U05UJTJCOEUxUnNGelRyU0JhWHhidm5jRFJDdSUyQkt0MDlkJTJCWTF1ZzhpTGpmUSUyRjRLVEFDcWlRJTNEJTNE',
        '_ga_VED1YSGNC7': 'GS1.1.1719310762.1.1.1719311333.60.0.0',
        'intl_common_forever': 'be0FkNSRlWq6nlPZ3SfmPswlkTGYcN7ox65ux2ySf3t9A/5Pxvy+ZQ==',
        'tfstk': 'fvPZvSsjL1CNTGemYDGq4xe7z1ctabIWoSijor4m5cmM1Ct0LoZ35ROjGxA4orHDCcGMmkzujcGfWcGVuuieXca1W--UoD7xfmib0SzmAoOb5oTTpuak156tGIltMjj5V_1WWPhxizltf2K9tqu5GdviIXHseQI5V_1QA9lzTgwXhaoZ8qnnSqAiIwknP43mix0g-wumlhvDso4h8qnnoV0mIXYnPqmDnb88foxmXAboPQFCPOWkK0ziYQzL9VXxb_npi5NmWAPW6DdmRWuZQ04g6NivX2VQTYMBlMG3yJZi8b5w2Vy3zf04Mavq0xPtTDrPZI3LIlPn3W_AayhZuYcmLEA08XmmUWhlZHuLKPw06l82iVF_Nxo-LZAxhXVSE8qwkaUnsqqxeSsJ6c40y7eSa1--7J4rTgSWD2bo1SewmKkiJ2o58wY4qKzrY-mIsKpxLYuEVaXwHKHiJ2o58wJvHvAq80_lQ',
        'isg': 'BMjIoCp2OpEYJ1YVq8y8af_ymTbacSx7EEbJtYJ_nsFtXWrHKIPyCREb1TUt6-RT',
        'epssw': '4*mmSgRmk9GPqPCODma5nHju00mmmjeFwNZ6gPjoi0PNCwIN5wr0Z6kQ0oa5xnWowGrn_emDDREUYtLpFzz1BmmK3RD4n4KNpndGZeb3pL-WYfYkdaLAgYiW27dGsmkVZRd5Q2Cmc_xnid9_5YM8hmkbwiLgiDFmD4PDV1FmFr7niDUb-jugrni8WmmKrra3NW8g1um0vOLPPmd8N0KYOlSmJMgg9DUTw1BqY6DkPe5JwXlJ26Sy7RKeOvGvbz0ZD85N2FYNsf_wKtn-oI2gtCzKoDDj0SFDObMK-_em..',
    }

    _DEFAULT_HEADERS = {
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'bx-v': '2.5.11',
        'content-type': 'application/json;charset=UTF-8',
        'origin': 'https://www.aliexpress.us',
        'priority': 'u=1, i',
        # The referer is the same search page that get_ck() opens.
        'referer': _LOGIN_URL,
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }

    def __init__(self):
        self.attempt_count = 0
        self.ui = None
        self.helper = None  # populated by set_helper()

    def init(self, ui):
        """Store the UI instance this controller drives."""
        self.ui = ui

    def set_helper(self, helper):
        """Store an optional helper object on the controller."""
        self.helper = helper

    def sousuo(self, evt):
        """Turn the UI keyword list into scrape jobs.

        Args:
            evt: Tk event forwarded unchanged to ``ui.zairu``.

        Returns:
            list of ``(key, page, sort_type)`` tuples, one per keyword row.
        """
        keyword_list = self.ui.zairu(evt)
        sort_type = self.ui.sort_var.get()
        if sort_type == "Price":
            sort_type = "price_asc"
        return [(key, page, sort_type) for key, page in keyword_list]

    @staticmethod
    def _cookies_to_dict(cookie_items):
        """Map an iterable of ``{'name': ..., 'value': ...}`` items to a dict."""
        return {cookie['name']: cookie['value'] for cookie in cookie_items}

    @staticmethod
    def _build_payload(key, page_no, sort_type):
        """Build the JSON body for one search-result page request."""
        data = {
            'page': page_no,
            'g': 'y',
            'SearchText': str(key),
            'origin': 'y',
        }
        # An empty sort type means "no explicit sort": leave the key out
        # instead of sending an empty string.
        if sort_type:
            data['sortType'] = sort_type
        return {
            'pageVersion': 'ff8ad60b0a0d1fbfc9e484ea303a7f44',
            'target': 'root',
            'data': data,
            'eventName': 'onChange',
            'dependency': [],
        }

    @staticmethod
    def _extract_items(response):
        """Return the item list of a search response.

        Raises:
            ValueError: if the body is not JSON (``JSONDecodeError`` is a
                ``ValueError`` subclass) or lacks the expected nesting.
        """
        body = response.json()  # parsed once, not once per lookup
        try:
            return body['data']['result']['mods']['itemList']['content']
        except (KeyError, TypeError) as err:
            raise ValueError("响应数据结构不符合预期") from err

    def get_ck(self):
        """Open the search page in a browser and return its cookies as a dict."""
        page = ChromiumPage()
        page.get(self._LOGIN_URL)
        time.sleep(10)
        # Direct mapping: the earlier join/json.dumps/regex round-trip lost
        # empty-valued cookies and corrupted values containing quotes.
        return self._cookies_to_dict(page.cookies())

    def spider(self, key, page, sort_type):
        """Scrape pages ``1..page`` of the search results for ``key``.

        Every parsed item is handed to ``ui.insert_data_to_table``. Each
        page is attempted up to ``_MAX_RETRIES`` times; an unparseable
        response triggers a cookie refresh through ``get_ck`` before the
        next attempt.

        Args:
            key: Search keyword.
            page: Number of pages to fetch (anything ``int()`` accepts).
            sort_type: Value for the request's ``sortType`` field; omitted
                from the request when empty.
        """
        cookies = dict(self._DEFAULT_COOKIES)
        for page_no in range(1, int(page) + 1):
            json_data = self._build_payload(key, page_no, sort_type)
            for _attempt in range(self._MAX_RETRIES):
                try:
                    response = requests.post(
                        self._SEARCH_URL,
                        json=json_data,
                        headers=self._DEFAULT_HEADERS,
                        cookies=cookies,
                        timeout=10,
                    )
                    try:
                        content_list = self._extract_items(response)
                    except ValueError:
                        # Unparseable response: refresh the cookies from
                        # the browser and retry this page.
                        print(f"{key}的线程正在重新获取cookie")
                        cookies = self.get_ck()
                        continue
                    for content in content_list:
                        try:
                            title = content['title']['seoTitle']
                            x_object_id = content['trace']['utLogMap']['x_object_id']
                            price_str = content['prices']['salePrice']['formattedPrice']
                        except (KeyError, TypeError):
                            # Skip an incomplete entry instead of
                            # discarding the rest of the page.
                            continue
                        store_url = 'https://www.aliexpress.us/item/' + str(x_object_id) + '.html'
                        # Drop the first three characters of the price
                        # (presumably a currency prefix such as "US ").
                        price = price_str[3:]
                        self.ui.insert_data_to_table(title, price, store_url, key)
                    print(f"正在爬取{key}的第{page_no}页")
                    break
                except requests.exceptions.RequestException as req_err:
                    print(req_err)
                    print(f"请求异常,正在重试")
                    continue
                except Exception as e:
                    # Anything unexpected ends the attempts for this page.
                    print(f"其他异常: {e}")
                    break
        self.ui.update_status(f"{key}已爬取完毕")
        print(f"{key}已爬取完毕")

    def daochu(self, evt):
        """Placeholder export handler; only logs the event."""
        print("<Button-1>事件未处理:", evt)

    def qingkong(self, evt):
        """Clear every table, input and status box in the UI."""
        self.ui.clear_all()

    def tianjia(self, evt):
        """Add the keyword currently typed in the UI to the keyword list."""
        self.ui.add_keyword()
main.py文件
# 导入布局文件
import re
import threading
from ui import Win as MainWin
# 导入窗口控制器
from control import Controller as MainUIController
import tkinter as tk
import time
import requests
from openpyxl import Workbook, load_workbook
import random
from ui import WinGUI
from DrissionPage import ChromiumPage, ChromiumOptions
class ScrapeThread(threading.Thread):
    """Worker thread that scrapes a single keyword.

    ``counter`` is a one-element list shared by all workers that holds the
    number of workers still running; ``lock`` guards it. The worker that
    brings the count to zero posts the final status message to the UI.
    """

    def __init__(self, controller, key, page, sort_type, counter, lock):
        threading.Thread.__init__(self)
        self.controller = controller
        self.key, self.page, self.sort_type = key, page, sort_type
        self.counter, self.lock = counter, lock

    def run(self):
        """Run the spider, then decrement the shared counter even on error."""
        try:
            self.controller.spider(self.key, self.page, self.sort_type)
        finally:
            with self.lock:
                remaining = self.counter[0] - 1
                self.counter[0] = remaining
                if not remaining:
                    self.controller.ui.update_status("已爬取完毕")
def on_button_click(event):
    """Handle a click on the search button.

    The scrape is launched from a separate thread so that the Tk event
    handler returns immediately.
    """
    launcher = threading.Thread(target=start_scraping, args=(event,))
    launcher.start()
def start_scraping(event):
    """Start one ScrapeThread per keyword returned by the controller.

    All workers share a one-element counter (initialised to the number of
    jobs) and a lock protecting it.
    """
    jobs = controller.sousuo(event)
    remaining = [len(jobs)]
    guard = threading.Lock()
    for job in jobs:
        key, page, sort_type = job
        worker = ScrapeThread(controller, key, page, sort_type, remaining, guard)
        worker.start()
if __name__ == "__main__":
    # Open the AliExpress search page in a browser before the GUI starts.
    search_url = "https://www.aliexpress.us/w/wholesale-men's-bottoms.html?g=y&SearchText=men%27s+bottoms"
    page = ChromiumPage()
    page.get(search_url)

    # Wire the controller and the window together.
    controller = MainUIController()
    app = MainWin(controller)
    app.set_controller(controller)
    controller.init(app)

    # Route the search button to the threaded scraper, then enter the
    # Tk event loop.
    app.tk_button_search.bind('<Button-1>', on_button_click)
    app.mainloop()
ui.py文件
绘制gui界面
from tkinter import *
from tkinter import filedialog
from tkinter.ttk import *
import openpyxl
from xlwt import Workbook
from openpyxl import Workbook, load_workbook
class WinGUI(Tk):
    """Main window of the scraper: creates and places every widget.

    Also provides the table/status helpers used by the controller
    (``insert_data_to_table``, ``update_status``, the ``clear_*`` family)
    and the Excel import/export handlers.

    Row numbering uses two independent counters: ``data_row`` for the
    result table and ``current_row`` for the keyword list.
    """

    def __init__(self):
        super().__init__()
        self.controller = None  # assigned later through set_controller()
        # Sort order shared by the radio buttons; the initial value is the
        # one carried by the "Orders" button.
        self.sort_var = StringVar()
        self.sort_var.set("total_tranpro_desc")
        self.__init_ui_components()
        self.__win()

    def __win(self):
        """Set title and a fixed 889x506 size, centred on the screen."""
        self.title("速卖通数据抓取")
        width = 889
        height = 506
        screenwidth = self.winfo_screenwidth()
        screenheight = self.winfo_screenheight()
        geometry = '%dx%d+%d+%d' % (width, height, (screenwidth - width) / 2, (screenheight - height) / 2)
        self.geometry(geometry)
        self.resizable(width=False, height=False)

    def __init_ui_components(self):
        """Create all widgets, then reset the counters and status box."""
        self.tk_table_数据展示 = self.__tk_table_数据展示(self)
        self.tk_input_key = self.__tk_input_key(self)
        self.tk_button_search = self.__tk_button_search(self)
        self.tk_text_zhuangtai = self.__tk_text_zhuangtai(self)
        self.tk_label_关键字标签 = self.__tk_label_关键字标签(self)
        self.tk_label_状态 = self.__tk_label_状态(self)
        self.tk_input_page = self.__tk_input_page(self)
        self.tk_label_页面数标签 = self.__tk_label_页面数标签(self)
        self.tk_label_优先级标签 = self.__tk_label_优先级标签(self)
        (self.tk_radio_button_BestMatch,
         self.tk_radio_button_Orders,
         self.tk_radio_button_Price) = self.__tk_radio_button_BestMatch(self)
        self.tk_button_导出文件 = self.__tk_button_导出文件(self)
        self.tk_button_清空 = self.__tk_button_清空(self)
        self.tk_text_导出文件的状态 = self.__tk_text_导出文件的状态(self)
        self.tk_button_导入文件 = self.__tk_button_导入文件(self)
        self.tk_button_tianjia = self.__tk_button_tianjia(self)
        self.tk_table_关键词列表 = self.__tk_table_关键词列表(self)
        self.current_row = 1  # next ID in the keyword list
        self.data_row = 1     # next ID in the result table
        self.clear_table()
        self.clear_status()

    def clear_table(self):
        """Remove every row from the result table and restart its IDs."""
        self.tk_table_数据展示.delete(*self.tk_table_数据展示.get_children())
        self.data_row = 1

    def clear_table1(self):
        """Remove every row from the keyword list and restart its IDs."""
        self.tk_table_关键词列表.delete(*self.tk_table_关键词列表.get_children())
        self.current_row = 1

    def clear_status(self):
        """Empty the scrape-status text box."""
        self.tk_text_zhuangtai.delete(1.0, END)

    def clear_input_key(self):
        """Empty the keyword entry."""
        self.tk_input_key.delete(0, END)

    def clear_input_page(self):
        """Empty the page-count entry."""
        self.tk_input_page.delete(0, END)

    def clear_export_status(self):
        """Empty the import/export status text box."""
        self.tk_text_导出文件的状态.delete(1.0, END)

    def clear_all(self):
        """Reset both tables, both entries and both status boxes."""
        self.clear_table()
        self.clear_status()
        self.clear_input_key()
        self.clear_input_page()
        self.clear_export_status()
        self.clear_table1()

    def set_controller(self, controller):
        """Attach the controller that handles button events."""
        self.controller = controller

    def __tk_radio_button_BestMatch(self, parent):
        """Create the three sort-order radio buttons.

        Returns:
            ``(best_match, orders, price)`` in the order their labels
            suggest, so the attributes they are assigned to hold the
            matching button.
        """
        # Creation order and coordinates are unchanged: "Orders" is the
        # topmost button, "Best Match" the middle one.
        rb_orders = Radiobutton(parent, text="Orders", variable=self.sort_var, value="total_tranpro_desc")
        rb_orders.place(x=396, y=361, width=98, height=30)
        rb_best_match = Radiobutton(parent, text="Best Match", variable=self.sort_var, value="total_tranpro_desc_desc")
        rb_best_match.place(x=396, y=406, width=98, height=30)
        rb_price = Radiobutton(parent, text="Price", variable=self.sort_var, value="price_asc")
        rb_price.place(x=397, y=450, width=96, height=30)
        return rb_best_match, rb_orders, rb_price

    def __tk_table_关键词列表(self, parent):
        """Create the keyword-list table (ID / keyword / pages)."""
        columns = {"ID": 55, "关键词": 180, "页数": 70}
        tk_table = Treeview(parent, show="headings", columns=list(columns))
        for text, width in columns.items():
            tk_table.heading(text, text=text, anchor='center')
            tk_table.column(text, anchor='center', width=width, stretch=False)
        tk_table.place(x=50, y=400, width=310, height=100)
        return tk_table

    def __tk_table_数据展示(self, parent):
        """Create the result table (ID / title / price / link / keyword)."""
        columns = {"ID": 70, "标题": 150, "价格": 70, "链接": 450, "关键词": 107}
        tk_table = Treeview(parent, show="headings", columns=list(columns))
        for text, width in columns.items():
            tk_table.heading(text, text=text, anchor='center')
            tk_table.column(text, anchor='center', width=width, stretch=False)
        tk_table.place(x=20, y=70, width=850, height=226)
        return tk_table

    def __tk_input_key(self, parent):
        """Create the keyword entry."""
        ipt = Entry(parent)
        ipt.place(x=125, y=360, width=119, height=30)
        return ipt

    def __tk_text_zhuangtai(self, parent):
        """Create the scrape-status text box."""
        text = Text(parent)
        text.place(x=606, y=361, width=155, height=31)
        text.tag_configure("center", justify='center')
        # Placeholder text; wiped by clear_status() at the end of widget
        # initialisation.
        text.insert("1.0", "Your centered text here", "center")
        return text

    def __tk_label_关键字标签(self, parent):
        """Create the "keyword" caption."""
        label = Label(parent, text="关键字", anchor="center")
        label.place(x=156, y=326, width=50, height=30)
        return label

    def __tk_label_状态(self, parent):
        """Create the "status" caption."""
        label = Label(parent, text="状态", anchor="center")
        label.place(x=663, y=327, width=50, height=30)
        return label

    def __tk_input_page(self, parent):
        """Create the page-count entry."""
        ipt = Entry(parent)
        ipt.place(x=267, y=360, width=90, height=30)
        return ipt

    def __tk_label_页面数标签(self, parent):
        """Create the "pages (1-60)" caption."""
        label = Label(parent, text="页面数(1-60)", anchor="center")
        label.place(x=267, y=326, width=100, height=30)
        return label

    def __tk_label_优先级标签(self, parent):
        """Create the "priority" caption above the radio buttons."""
        label = Label(parent, text="优先级", anchor="center")
        label.place(x=421, y=326, width=50, height=30)
        return label

    def __tk_button_导出文件(self, parent):
        """Create the export button."""
        btn = Button(parent, text="导出文件", takefocus=False)
        btn.place(x=650, y=416, width=71, height=30)
        return btn

    def __tk_button_导入文件(self, parent):
        """Create the import button."""
        btn = Button(parent, text="导入文件", takefocus=False)
        btn.place(x=50, y=360, width=71, height=30)
        return btn

    def __tk_button_tianjia(self, parent):
        """Create the "add keyword" button."""
        btn = Button(parent, text="添加", takefocus=False)
        btn.place(x=527, y=361, width=50, height=30)
        return btn

    def __tk_button_search(self, parent):
        """Create the search button."""
        btn = Button(parent, text="搜索", takefocus=False)
        btn.place(x=527, y=407, width=50, height=30)
        return btn

    def __tk_button_清空(self, parent):
        """Create the "clear" button."""
        btn = Button(parent, text="清空", takefocus=False)
        btn.place(x=527, y=457, width=50, height=30)
        return btn

    def __tk_text_导出文件的状态(self, parent):
        """Create the import/export status text box."""
        text = Text(parent)
        text.place(x=606, y=451, width=155, height=30)
        return text

    def _set_export_status(self, message):
        """Replace the content of the import/export status box."""
        self.tk_text_导出文件的状态.delete(1.0, END)
        self.tk_text_导出文件的状态.insert(END, message)

    def insert_data_to_table(self, title, price, store_url, key):
        """Append one scraped item to the result table."""
        # data_row is independent of the keyword list, so result IDs start
        # at 1 no matter how many keywords were added.
        row_id = self.data_row
        self.tk_table_数据展示.insert('', 'end', values=[row_id, title, price, store_url, key])
        self.data_row += 1

    def update_status(self, status_text):
        """Replace the content of the scrape-status box."""
        self.tk_text_zhuangtai.delete(1.0, END)
        self.tk_text_zhuangtai.insert(END, status_text)

    def export_to_excel(self, event):
        """Ask for a target path and write the result table to a workbook.

        The outcome (success, failure reason, or "no path chosen") is
        shown in the import/export status box.
        """
        # The workbook built below is a plain one with no macro payload,
        # so only the .xlsx extension is offered; a macro-enabled (.xlsm)
        # extension would not match the file content.
        file_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[("Excel Workbook", "*.xlsx")],
        )
        if not file_path:
            self._set_export_status("未选择文件路径")
            return
        try:
            wb = Workbook()
            ws = wb.active
            ws.title = "数据导出"
            ws.append(["ID", "标题", "价格", "链接", "关键字"])
            # IDs are renumbered 1..n in table order on export.
            for i, item in enumerate(self.tk_table_数据展示.get_children(), start=1):
                values = self.tk_table_数据展示.item(item, "values")
                ws.append([i, values[1], values[2], values[3], values[4]])
            wb.save(file_path)
            self._set_export_status("导出成功")
        except Exception as e:
            self._set_export_status(f"导出失败:{str(e)}")

    def daoru(self, event):
        """Load keywords from a workbook into the keyword list.

        The first row is treated as a header. For every other row the
        first two columns are read as ``(keyword, page count)``; rows
        where either one is empty are skipped and extra columns are
        ignored. Existing keyword rows are replaced.
        """
        file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xlsm")])
        if not file_path:
            return
        try:
            wb = load_workbook(filename=file_path)
            sheet = wb.active
            self.clear_table1()
            for row in sheet.iter_rows(min_row=2, values_only=True):
                keyword = row[0] if len(row) > 0 else None
                page = row[1] if len(row) > 1 else None
                if keyword is None or page is None:
                    continue
                # A whole number stored as a float (2.0) would later fail
                # int("2.0") in the scraper; normalise it here.
                if isinstance(page, float) and page.is_integer():
                    page = int(page)
                self.tk_table_关键词列表.insert('', 'end', values=(self.current_row, keyword, page))
                self.current_row += 1
            self._set_export_status("文件导入成功!\n")
        except Exception as e:
            self._set_export_status(f"文件导入失败:{e}\n")
class Win(WinGUI):
    """Window with its buttons wired to a controller.

    Args:
        controller: Object providing ``sousuo``, ``qingkong`` and
            ``tianjia`` event handlers.
    """

    def __init__(self, controller):
        super().__init__()
        self.set_controller(controller)
        self.__event_bind()
        self.__style_config()

    def __event_bind(self):
        """Bind every button's left click to its handler."""
        self.tk_button_search.bind('<Button-1>', lambda event: self.controller.sousuo(event))
        self.tk_button_导出文件.bind('<Button-1>', self.export_to_excel)
        self.tk_button_导入文件.bind('<Button-1>', self.daoru)
        self.tk_button_清空.bind('<Button-1>', lambda event: self.controller.qingkong(event))
        self.tk_button_tianjia.bind('<Button-1>', lambda event: self.controller.tianjia(event))

    def __style_config(self):
        """Hook for ttk style tweaks; intentionally empty."""
        pass

    def add_keyword(self):
        """Move the typed keyword and page count into the keyword list.

        Nothing happens when either field is blank. A page count that is
        not a whole number between 1 and 60 (the range shown on the
        label) is rejected with a status message, because the scraper
        later converts it with ``int()``.
        """
        key = self.tk_input_key.get().strip()
        page = self.tk_input_page.get().strip()
        if not key or not page:
            return
        if not page.isdecimal() or not 1 <= int(page) <= 60:
            self.update_status("页数必须是1-60之间的整数")
            return
        self.tk_table_关键词列表.insert("", "end", values=(self.current_row, key, page))
        self.current_row += 1
        self.clear_input_key()
        self.clear_input_page()

    def update_status_1(self, status_text):
        """Show ``status_text`` in the login-status box.

        No ``tk_text_dengluzhuangtai`` widget is created by the layout, so
        the regular status box is used unless such a widget is present in
        the instance.
        """
        # vars() avoids Tk's attribute delegation, which raises
        # AttributeError for names the window does not define.
        box = vars(self).get('tk_text_dengluzhuangtai', self.tk_text_zhuangtai)
        box.delete(1.0, END)
        box.insert(END, status_text)

    def zairu(self, event):
        """Return the keyword list as ``[(keyword, page), ...]``.

        Both values are returned exactly as the table stores them.
        """
        table = self.tk_table_关键词列表
        keyword_list = []
        for item in table.get_children():
            values = table.item(item, "values")
            keyword_list.append((values[1], values[2]))
        return keyword_list
if __name__ == "__main__":
    # Stand-alone preview of the layout with a stub controller that only
    # logs the events it receives.
    class Controller:
        """Minimal controller implementing every handler Win binds."""

        def __init__(self):
            pass

        def sousuo(self, event):
            print("Sousuo event triggered")

        def daochu(self, event):
            print("Daochu event triggered")

        def qingkong(self, event):
            print("Qingkong event triggered")

        def tianjia(self, event):
            # Win binds the "添加" button to controller.tianjia; without
            # this method the click raised AttributeError.
            print("Tianjia event triggered")

    controller = Controller()
    app = Win(controller)
    app.mainloop()
项目展示
导入文件,点击搜索
导入的是Excel表格,表格内容如下
成功搜索后的页面
导出文件:导出的文件也是Excel表格。因为是多线程抓取,不同关键词的数据会交错写入,表格顺序有点乱,可以手动筛选排序