# Python web-crawler starter code (example target: Qichacha, www.qcc.com)
from bs4 import BeautifulSoup
import requests
import time
import re
from lxml import etree
# # Keep a persistent session
# # Create a new Session object
# sess = requests.Session()
# # Add headers (taken from your own logged-in Qichacha page: the headers shown
# # after logging in with your account; how to grab them is described above this code)
# afterLogin_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'}
#
# # POST request (this performs the login; log in once and the session is kept,
# # so the later query requests can reuse it)
# login = {'user': 'YOUR_ACCOUNT', 'password': 'YOUR_PASSWORD'}  # use your own credentials; never publish real ones
# code = sess.post(url='https://www.qcc.com/', data=login, headers=afterLogin_headers)
# print(code)
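# # Note: the POST above targets the site root; the real login endpoint may differ,
# # so verify it in DevTools. Once logged in, reuse the same session so its cookies
# # are sent automatically. A minimal sketch, mirroring the search URL defined below:
# result = sess.get('https://www.qcc.com/web/search?key=%E6%AF%94%E4%BA%9A%E8%BF%AA',
#                   headers=afterLogin_headers)
# print(result.status_code)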
url = 'https://www.qcc.com/web/search?key=%E6%AF%94%E4%BA%9A%E8%BF%AA'  # search keyword: "比亚迪" (BYD), URL-encoded
headers = {
# 'authority' / 'method' / 'path' / 'scheme' are HTTP/2 pseudo-headers copied from
# DevTools (from a request to anchor.qcc.com); requests sends them as ordinary
# headers, where they are redundant but harmless
'authority': 'anchor.qcc.com',
'method':'GET',
'path':r'/',
'scheme':'https',
'Accept':'application/json, text/plain, */*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
# NOTE: replace the Cookie below with your own (copy it from DevTools after logging
# in); QCCSESSID is a session ID and expires
'Cookie':'qcc_did=c8623839-d520-42f5-a348-b9d75877d8af; UM_distinctid=18b1e0cb8d39e2-0347a3ecbe3e8d-78505770-e3658-18b1e0cb8d481f; QCCSESSID=818c3ff6444d3785e65fa0b875; tfstk=fQLvkeDC7YDcp-3YYhikbOt5umcokEdqiKRQSOX0fLpJOBycfiDVB3p6UiXbojSt6L9eiGY9_PC61dlVIjokgI7N5vDHW2A2gF097v8AG4wWa61_GAGrE-bN5vDlDetEaNJnK05A6QGRs1efCdsXN86P6Ow6Cts7F6CF5OG9Uormh8aHce1jYHWFKbL_fsSAdQnUDogVGJX9GTU_5tCADudfePa6y3SY6OBjY7XNotJcMduUuZs9fEbWlYg1RHJJWM_Kjj7BuCtefQyx6tOk3eQXJxUcZGBvVEs81POBGTIJXKHY0ttDeM55V53FZpXWgEt-_8OXKtQfNg0I9Qs14n8HJhvTK9Cgcbh8blSfaRrc69rdqEZCw9conlrNVACRKbh8blSfa_BnahZabgMO.',
'Origin':'https://www.qcc.com',
'Referer':'https://www.qcc.com/',
'Sec-Ch-Ua':'"Chromium";v="122", "Not(A:Brand";v="24", "Microsoft Edge";v="122"',
'Sec-Ch-Ua-Mobile':'?0',
'Sec-Ch-Ua-Platform':'"Windows"',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-site',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
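# The headers above are copied verbatim from the browser. Before parsing anything it
# is worth confirming the request actually succeeded (anti-bot checks often answer
# with a 403 or a redirect to a verification page). A minimal sketch; the helper
# name fetch() is just illustrative:
def fetch(target_url, request_headers):
    """GET a page, raise on HTTP errors, and hand back the decoded body."""
    resp = requests.get(target_url, headers=request_headers, timeout=10)
    resp.raise_for_status()  # surfaces 4xx/5xx responses such as anti-bot 403s
    return resp.text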
# Mess = requests.get(url, headers=headers).content.decode()
# time.sleep(0.05)
# try:
#     # pull the contact number out of the page's embedded JSON with a non-greedy match
#     Mess_Num = re.search('"ContactNumber":"(.*?)","Email"', Mess, re.S).group(1)
# except AttributeError:  # re.search returned None: pattern not found
#     Mess_Num = []
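# A slightly safer variant that avoids the try/except (re.search returns None
# when the pattern is absent):
# m = re.search(r'"ContactNumber":"(.*?)","Email"', Mess, re.S)
# Mess_Num = m.group(1) if m else None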
response = requests.get(url, headers=headers)
time.sleep(0.05)  # brief pause between requests (matters once this runs in a loop)
html = etree.HTML(response.text)  # parse the raw HTML into an lxml element tree
# XPath into each search-result row (<tr class="tsd0">), down to the text of the
# nested <span class="f"><span> element
xpath_link = '//table/tr[@class="tsd0"]/td/div[@class="maininfo"]/div[@class="relate-info"]/div/span[@class="f"]/span/text()'
test1 = html.xpath(xpath_link)
print(test1)
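# The XPath is tied to the exact page layout, so an empty list usually means the
# markup changed, the Cookie expired, or the request hit an anti-bot page:
if not test1:
    print('No matches: check the response HTML, the Cookie, and the XPath')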
# Alternative parse with BeautifulSoup: walk every result row and grab its first cell
# soup = BeautifulSoup(response.text, 'html.parser')
# # print(soup)
# all_titles = soup.find_all('tr')
# # print(all_titles)
# for title in all_titles:
#     money = title.find('td')  # first <td> in this row
#     # money1 = title.find('span')
#     print('--------------------------------')
#     print(money)
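# Once extraction works, the results can be persisted. A minimal sketch using the
# standard library; the file name and column label are illustrative (utf-8-sig lets
# Excel open Chinese text correctly):
import csv

with open('qcc_results.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['contact'])   # header row
    for item in test1:             # one scraped value per row
        writer.writerow([item])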