爬取小红书图片
流程
代码
Python 代码:爬取单篇笔记的图片
import requests
import re
import os
# Download every image of a single Xiaohongshu note into the local "img"
# directory, naming the files 1.jpg, 2.jpg, ... in page order.

# 1. Send the request
# NOTE(review): the cookie is a hard-coded browser session; presumably it
# expires and has to be refreshed by hand -- avoid committing a live session.
headers = {
    'cookie': 'acw_tc=0a00d90517519520293856129e6cb5ea858372dd35e69f821d49648d071928; abRequestId=4805765c-5eab-573e-bb1c-d10416ba4bac; webBuild=4.72.0; xsecappid=xhs-pc-web; loadts=1751952030345; a1=197e87a7a8ax2rvhl6h9ilethxlj7rs4vbm2u82bs50000169918; webId=aedd0ea213fda763d49126083b06d063; websectiga=984412fef754c018e472127b8effd174be8a5d51061c991aadd200c69a2801d6; sec_poison_id=dd75d578-e649-4080-9dd5-f26334c524ce; web_session=030037af5ffed41e5f99ecec142f4a719b7905; gid=yjWdYW0jWWhSyjWdYW0W0DYJY0CJFhx1Kxj36DM1d9xC1J28VWFM47888yKjjyY8S0y4f2fd',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'referer': 'https://www.xiaohongshu.com/',
}
# Search API endpoint; defined for reference, not requested by this script.
search_url = 'https://edith.xiaohongshu.com/api/sns/web/v1/search/notes'
url = 'https://www.xiaohongshu.com/explore/686ba9560000000012020bcb?xsec_token=ABpilsvwFWex2FHPRruyBjWoqbUYpCz9YRErWkVHq_7WI=&xsec_source=pc_user'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=10)

# 2. Get the data
html = response.text

# 3. Parse the data
# Extract the title. The tag may be absent (the regex simply finds nothing),
# so do not index the result blindly.
titles = re.findall(r'<meta name="og:title" content="(.*?)">', html)
title = titles[0] if titles else ''
if titles:
    print(title)
else:
    print('og:title not found in the page (status %s)' % response.status_code)

# Extract the image links
img_list = re.findall(r'<meta name="og:image" content="(.*?)">', html)

# 4. Save the data
# Create the output directory up front; open() does not create it.
os.makedirs('img', exist_ok=True)
for num, img in enumerate(img_list, start=1):
    img_response = requests.get(url=img, headers=headers, timeout=10)
    if not img_response.ok:
        # Do not store an error page under a .jpg name.
        print(f'skip image {num}: HTTP {img_response.status_code}')
        continue
    img_content = img_response.content
    # os.path.join picks the right separator on every platform.
    with open(os.path.join('img', f'{num}.jpg'), 'wb') as f:
        f.write(img_content)
Python 代码:通过 DrissionPage 监听搜索接口,批量爬取笔记图片
import requests
import re
import os
# 导入自动化模块
from DrissionPage import ChromiumPage
# Batch download: open the search result page in a browser, capture the
# 'search/notes' response, then fetch each listed note page and save its
# images as img/<title>-<n>.jpg.

# Open the browser
dp = ChromiumPage()
# Start listening for data packets matching 'search/notes'
dp.listen.start('search/notes')
# Visit the search result page
dp.get('https://www.xiaohongshu.com/search_result/?keyword=%25E5%25A3%2581%25E7%25BA%25B8&source=web_note_detail_r10&type=51')
# Wait for the data packet to load
r = dp.listen.wait()
# Get the response content
# NOTE(review): assumes the body is already parsed JSON (a dict) -- confirm
# against the DrissionPage listener documentation.
json_data = r.response.body
items = json_data['data']['items']

# Request headers are identical for every note, so build them once.
# NOTE(review): the cookie is a hard-coded browser session; presumably it
# expires and has to be refreshed by hand -- avoid committing a live session.
headers = {
    'cookie': 'acw_tc=0a00d90517519520293856129e6cb5ea858372dd35e69f821d49648d071928; abRequestId=4805765c-5eab-573e-bb1c-d10416ba4bac; webBuild=4.72.0; xsecappid=xhs-pc-web; loadts=1751952030345; a1=197e87a7a8ax2rvhl6h9ilethxlj7rs4vbm2u82bs50000169918; webId=aedd0ea213fda763d49126083b06d063; websectiga=984412fef754c018e472127b8effd174be8a5d51061c991aadd200c69a2801d6; sec_poison_id=dd75d578-e649-4080-9dd5-f26334c524ce; web_session=030037af5ffed41e5f99ecec142f4a719b7905; gid=yjWdYW0jWWhSyjWdYW0W0DYJY0CJFhx1Kxj36DM1d9xC1J28VWFM47888yKjjyY8S0y4f2fd',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'referer': 'https://www.xiaohongshu.com/',
}
# Search API endpoint; defined for reference, not requested via requests.
search_url = 'https://edith.xiaohongshu.com/api/sns/web/v1/search/notes'

# Create the output directory up front; open() does not create it.
os.makedirs('img', exist_ok=True)

for item in items:
    id_ = item['id']
    # Entries whose id contains '-' are skipped.
    # NOTE(review): presumably these are non-note entries -- confirm.
    if '-' in id_:
        continue
    token = item['xsec_token']
    print(id_, token)
    url = f'https://www.xiaohongshu.com/explore/{id_}?xsec_token={token}&xsec_source=pc_user'
    try:
        # 1. Send the request (inside the try so that one failing note does
        # not abort the whole batch).
        response = requests.get(url=url, headers=headers, timeout=10)
        # 2. Get the data
        html = response.text
        # 3. Parse the data
        # Extract the title
        titles = re.findall(r'<meta name="og:title" content="(.*?)">', html)
        if not titles:
            print(f'skip {id_}: og:title not found (status {response.status_code})')
            continue
        old_title = titles[0]
        # Drop characters that are illegal in Windows file names; fall back
        # to the note id when nothing usable is left.
        title = re.sub(r'[\\/:*?"<>|\n\r]', '', old_title) or id_
        print(title)
        # Extract the image links
        img_list = re.findall(r'<meta name="og:image" content="(.*?)">', html)
        # 4. Save the data
        for num, img in enumerate(img_list, start=1):
            img_response = requests.get(url=img, headers=headers, timeout=10)
            if not img_response.ok:
                # Do not store an error page under a .jpg name.
                print(f'skip image {num} of {id_}: HTTP {img_response.status_code}')
                continue
            img_content = img_response.content
            # os.path.join picks the right separator on every platform.
            with open(os.path.join('img', f'{title}-{num}.jpg'), 'wb') as f:
                f.write(img_content)
    except (requests.RequestException, OSError) as e:
        # Network or file-system failure for this note: report and move on.
        print(e)