爬取微博评论
流程
代码
python代码
""" python 采集微博评论"""
import csv
import requests
from datetime import datetime
# 爬取目标网址: https://weibo.com/1195242865/Q28CwnQyw
# Column order of the output CSV; DictWriter expects each row dict to use these keys.
csv_columns = ['日期', '昵称', '性别', '地区', '当前位置', '评论', '描述']
# newline='' is what the csv module asks for when it is handed a file object;
# 'utf-8-sig' puts a BOM at the start of the file (presumably so spreadsheet
# tools detect the encoding). The handle stays open at module level because
# the rows are written later through csv_writer.
f = open('data.csv', mode='w', newline='', encoding='utf-8-sig')
csv_writer = csv.DictWriter(f, fieldnames=csv_columns)
csv_writer.writeheader()
# Request headers that make the call look like it comes from a regular browser.
headers = {
    # Cookie copied from a browser session; per the original author's note the
    # site uses it to check whether the request belongs to a logged-in account.
    # NOTE(review): this credential is hard-coded in the source - consider
    # loading it from the environment instead of committing it.
    'cookie': (
        'SUB=_2AkMfsEW7f8NxqwFRmvwWzm7nZIV0yArEieKp7LRgJRMxHRl-yT9xqk0vtRB6NDBrVCarYZjlu_nRNVG5dwvVzEhS-y4r; '
        'SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5hk-.OY0fp2WOjXH8T5_CQ; '
        'XSRF-TOKEN=sfi19qFE_ZpLEQmzlbQJdxiJ; '
        'WBPSESS=cbjWrs_UvapT2Pg_hlK_fEjSg_LlZXmo9fnpnvcVnV4awhKssG_jwe_dwh5Cc3Y12ssa2ewzdQ75MzVIPzmQK4SNCQWkNm0aC08ei9RFLqy9covtSs9kLoWQzWq8wuJfIhZ47K1lSIbuDVl9JF3jMmvOh-_ANGhPI0Roylm6lt8='
    ),
    # Basic identity information of the browser and device.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
    ),
    # Anti-hotlinking: when a browser requests a resource (such as an image) it
    # attaches a Referer field to the HTTP request headers saying which page
    # issued the request. The anti-hotlinking mechanism inspects that Referer
    # to decide whether the request comes from a legitimate source.
    'referer': 'https://weibo.com/1195242865/Q28CwnQyw',
}
# Endpoint of the comments API.
url = 'https://weibo.com/ajax/statuses/buildComments'
# Query-string payload, i.e. the parameters that follow "?" in the request URL.
# All values are sent as strings.
params = dict(
    is_reload='1',
    id='5205097965683572',  # presumably the numeric id of the target post - TODO confirm
    is_show_bulletin='2',
    is_mix='0',
    count='10',  # presumably the number of comments per response - TODO confirm
    uid='1195242865',  # same number as the first path segment of the referer URL
    fetch_level='0',
    locale='en-US',
)
# Send the request. The explicit timeout keeps the script from hanging forever
# on a stalled connection (requests waits indefinitely when none is given).
response = requests.get(url=url, headers=headers, params=params, timeout=10)
# Fail fast on an HTTP error status instead of trying to decode an error page
# as JSON.
response.raise_for_status()
json_data = response.json()
# The comment objects processed below sit under the top-level 'data' key.
comment_list = json_data['data']
# Input format of a comment's 'created_at' field and the format written to the CSV.
CREATED_AT_FORMAT = '%a %b %d %H:%M:%S %z %Y'
OUTPUT_DATE_FORMAT = '%Y/%m/%d %H:%M:%S'
# Gender codes found in the response mapped to the labels written to the CSV;
# any other value is reported as unknown.
GENDER_LABELS = {'f': '女', 'm': '男'}


def comment_to_row(comment):
    """Convert one comment object from the response into a CSV row dict.

    Args:
        comment: A mapping with the keys 'created_at', 'source', 'text' and
            'user'; 'user' is itself a mapping with 'gender', 'screen_name',
            'location' and 'description'.

    Returns:
        A dict keyed by the CSV column names.

    Raises:
        KeyError: If any of the keys listed above is missing.
        ValueError: If 'created_at' does not match CREATED_AT_FORMAT.
    """
    user = comment['user']
    # The UTC offset is parsed but not printed, so the written time is the
    # wall-clock time in the offset the API reported.
    created = datetime.strptime(comment['created_at'], CREATED_AT_FORMAT)
    return {
        '日期': created.strftime(OUTPUT_DATE_FORMAT),
        '昵称': user['screen_name'],
        '性别': GENDER_LABELS.get(user['gender'], '未知'),
        '地区': comment['source'],
        '当前位置': user['location'],
        '评论': comment['text'],
        '描述': user['description'],
    }


try:
    comment_info = []
    for index in comment_list:
        dit = comment_to_row(index)
        print(dit)
        comment_info.append(dit)
    # Rows are written only after every comment has been converted, so a
    # malformed comment aborts the run before any row is written.
    csv_writer.writerows(comment_info)
finally:
    # Close the output file even if conversion fails, so the handle is
    # released and buffered data reaches disk.
    f.close()