流程

代码

+ python代码（单个笔记：下载指定笔记页面中的图片）

import requests
import re
import os

# Download every image referenced by one note page into the local `img` directory.

# 1. Send the request.
# NOTE(review): the cookie is a captured browser session; presumably it must be
# refreshed once it expires — confirm against the site's behavior.
headers = {
    'cookie':'acw_tc=0a00d90517519520293856129e6cb5ea858372dd35e69f821d49648d071928; abRequestId=4805765c-5eab-573e-bb1c-d10416ba4bac; webBuild=4.72.0; xsecappid=xhs-pc-web; loadts=1751952030345; a1=197e87a7a8ax2rvhl6h9ilethxlj7rs4vbm2u82bs50000169918; webId=aedd0ea213fda763d49126083b06d063; websectiga=984412fef754c018e472127b8effd174be8a5d51061c991aadd200c69a2801d6; sec_poison_id=dd75d578-e649-4080-9dd5-f26334c524ce; web_session=030037af5ffed41e5f99ecec142f4a719b7905; gid=yjWdYW0jWWhSyjWdYW0W0DYJY0CJFhx1Kxj36DM1d9xC1J28VWFM47888yKjjyY8S0y4f2fd',
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'referer':'https://www.xiaohongshu.com/'
}

# Search API endpoint; not used by this script, kept for reference.
search_url = 'https://edith.xiaohongshu.com/api/sns/web/v1/search/notes'

url = 'https://www.xiaohongshu.com/explore/686ba9560000000012020bcb?xsec_token=ABpilsvwFWex2FHPRruyBjWoqbUYpCz9YRErWkVHq_7WI=&xsec_source=pc_user'
# requests has no default timeout; without one a stalled connection hangs forever.
response = requests.get(url=url, headers=headers, timeout=10)

# 2. Get the page source.
html = response.text
# print(html)

# 3. Parse the data.
# Extract the title. Fall back to an empty string instead of raising IndexError
# when the page carries no og:title tag.
title_matches = re.findall(r'<meta name="og:title" content="(.*?)">', html)
title = title_matches[0] if title_matches else ''
print(title)

# Extract the image links.
img_list = re.findall(r'<meta name="og:image" content="(.*?)">', html)

# 4. Save the data.
# Create the output directory up front; open() does not create missing folders.
os.makedirs('img', exist_ok=True)
# Images are numbered from 1 in the order they appear in the page.
for num, img in enumerate(img_list, start=1):
    img_response = requests.get(url=img, headers=headers, timeout=10)
    if not img_response.ok:
        # Do not store an HTTP error body under a .jpg name.
        print(f'skip image {num}: HTTP {img_response.status_code}')
        continue
    # os.path.join keeps the path valid on non-Windows systems as well.
    with open(os.path.join('img', f'{num}.jpg'), 'wb') as f:
        f.write(img_response.content)

+ python代码（批量：用 DrissionPage 监听搜索接口，下载搜索结果中各笔记的图片）

import requests
import re
import os
# 导入自动化模块
from DrissionPage import ChromiumPage
# Open the search result page in a browser, capture the search API response,
# then download the images of every note listed in it into the `img` directory.

# Open the browser.
dp = ChromiumPage()
# Start listening for network packets matching 'search/notes'.
dp.listen.start('search/notes')
# Visit the search result page.
dp.get('https://www.xiaohongshu.com/search_result/?keyword=%25E5%25A3%2581%25E7%25BA%25B8&source=web_note_detail_r10&type=51')
# Wait for the listened packet to arrive.
r = dp.listen.wait()
# Read the response body (indexed below as a parsed JSON object).
json_data = r.response.body
items = json_data['data']['items']

# Request headers are identical for every note, so build them once outside the loop.
# NOTE(review): the cookie is a captured browser session; presumably it must be
# refreshed once it expires — confirm against the site's behavior.
headers = {
    'cookie': 'acw_tc=0a00d90517519520293856129e6cb5ea858372dd35e69f821d49648d071928; abRequestId=4805765c-5eab-573e-bb1c-d10416ba4bac; webBuild=4.72.0; xsecappid=xhs-pc-web; loadts=1751952030345; a1=197e87a7a8ax2rvhl6h9ilethxlj7rs4vbm2u82bs50000169918; webId=aedd0ea213fda763d49126083b06d063; websectiga=984412fef754c018e472127b8effd174be8a5d51061c991aadd200c69a2801d6; sec_poison_id=dd75d578-e649-4080-9dd5-f26334c524ce; web_session=030037af5ffed41e5f99ecec142f4a719b7905; gid=yjWdYW0jWWhSyjWdYW0W0DYJY0CJFhx1Kxj36DM1d9xC1J28VWFM47888yKjjyY8S0y4f2fd',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'referer': 'https://www.xiaohongshu.com/'
}

# Search API endpoint; not used by this script, kept for reference.
search_url = 'https://edith.xiaohongshu.com/api/sns/web/v1/search/notes'

# Create the output directory up front; open() does not create missing folders.
os.makedirs('img', exist_ok=True)

for item in items:
    id_ = item['id']
    # Entries whose id contains '-' are skipped
    # (presumably they are not regular notes — TODO confirm).
    if '-' in id_:
        continue
    token = item['xsec_token']
    print(id_, token)

    url = f'https://www.xiaohongshu.com/explore/{id_}?xsec_token={token}&xsec_source=pc_user'
    # Best effort per note: a failure is reported and the loop moves on to the next note.
    try:
        # 1. Send the request. requests has no default timeout, so set one
        # to avoid hanging the whole batch on a stalled connection.
        response = requests.get(url=url, headers=headers, timeout=10)
        # 2. Get the page source.
        html = response.text
        # print(html)
        # 3. Parse the data.
        # Extract the title; [0] raises IndexError when the tag is missing.
        old_title = re.findall(r'<meta name="og:title" content="(.*?)">', html)[0]
        # Strip characters that are not allowed in Windows file names, plus line breaks.
        title = re.sub(r'[\\/:*?"<>|\n\r]','',old_title)
        print(title)
        # Extract the image links.
        img_list = re.findall(r'<meta name="og:image" content="(.*?)">', html)
        # 4. Save the data; images are numbered from 1 within each note.
        for num, img in enumerate(img_list, start=1):
            img_response = requests.get(url=img, headers=headers, timeout=10)
            # Do not store an HTTP error body under a .jpg name.
            img_response.raise_for_status()
            # os.path.join keeps the path valid on non-Windows systems as well.
            with open(os.path.join('img', f'{title}-{num}.jpg'), 'wb') as f:
                f.write(img_response.content)
    except (requests.RequestException, IndexError, OSError) as e:
        # Network failure, missing title tag, or file-system error.
        print(e)

Markdown 编辑器