Crawling Zhihu Answer Data with Python

优采云 | Published 2022-09-20 19:48


import os

import pandas as pd
import requests


def fetch(next_url):
    """Request one page of answers and return the parsed JSON."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
        # Paste the cookie from your own logged-in session here (see note ① below);
        # the original value has been redacted.
        'cookie': 'PASTE_YOUR_ZHIHU_COOKIE_HERE',
    }
    res = requests.get(next_url, headers=headers)
    return res.json()


def parse(y):
    """Pull author names and answer text out of the JSON, plus the URL of the next page."""
    data = y['data']                  # y is the parsed JSON returned by fetch()
    next_url = y['paging']['next']    # Zhihu hands us the next page's URL directly
    comment_all = []
    for item in data:
        author = item['target']['author']['name']
        content = item['target']['excerpt']   # 'excerpt' is only a summary; see note ③
        comment_all.append([author, content])
    return comment_all, next_url


def save(data, path, filename):
    """Append rows to the CSV, creating the folder if needed."""
    if not os.path.exists(path):
        os.makedirs(path)
    df = pd.DataFrame(data)
    df.to_csv(path + filename, encoding='utf_8_sig', mode='a',
              index=False, sep=',', header=False)


if __name__ == '__main__':
    # The first Request URL copied from the browser's Network panel.
    next_url = 'https://www.zhihu.com/api/v4/questions/63014725/feeds?cursor=e3cd4f087b3e3cd07c3325d6c31d961b&include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,is_labeled,paid_info,paid_info_content,reaction_instruction,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,is_recognized;data[*].mark_infos[*].url;data[*].author.follower_count,vip_info,badge[*].topics;data[*].settings.table_of_content.enabled&limit=5&offset=0&order=default&platform=desktop&session_id=1662864215041635408'
    path = 'F:/知乎/'
    filename = 'zhihu.csv'
    csvHeader = [['用户昵称', '评论内容']]
    save(csvHeader, path, filename)   # write the header row first, reusing save()
    while True:
        y = fetch(next_url)
        comment_all, next_url = parse(y)
        if not comment_all:           # an empty page means we have run out of answers
            break
        save(comment_all, path, filename)

Notes:

① You must add a cookie to the headers, otherwise the API returns an empty JSON payload. Use a spare account's cookie if you can; if the account gets banned, that's on you [dog-head emoji].
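One way to keep the cookie out of the source file is to read it from an environment variable. This is a minimal sketch; the variable name ZHIHU_COOKIE is just an illustration, not something the original code uses:

import os

# Assumption: you exported the cookie in your shell first, e.g.
#   set ZHIHU_COOKIE=...   (Windows)   or   export ZHIHU_COOKIE=...   (Linux/macOS)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    'cookie': os.environ.get('ZHIHU_COOKIE', ''),   # an empty cookie gets you empty JSON back
}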

② Older Zhihu scrapers paginated with the offset parameter, but that value no longer changes between pages. Comparing the Request URLs shows a new cursor parameter that does change, and that cursor is the way in. In each captured response, ['paging']['next'] holds the URL of the next page, so we just keep feeding that URL back into fetch(): every response gives us both the content we want and the URL of the following page, and looping like this walks through all the answers.
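To see what the paging object contains in your own capture (and whether it offers anything besides next that could serve as a stop condition), a quick inspection sketch, with placeholder URL and cookie:

import json

import requests

# Placeholders: use the Request URL copied from DevTools and your own cookie.
first_url = 'https://www.zhihu.com/api/v4/questions/63014725/feeds?cursor=...&limit=5&offset=0'
headers = {'User-Agent': 'Mozilla/5.0', 'cookie': 'PASTE_YOUR_ZHIHU_COOKIE_HERE'}

y = requests.get(first_url, headers=headers).json()
# Pretty-print the paging block; 'next' is the URL of the following page. Many Zhihu v4
# responses also carry flags such as 'is_end', but verify that against your own capture.
print(json.dumps(y.get('paging', {}), ensure_ascii=False, indent=2))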

③ The response offers two text fields, content and excerpt. excerpt is only a summary, so the text is incomplete, but extracting content kept throwing errors for me and I haven't figured out why, haha.
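One likely cause is that not every feed item actually carries a content field (and content, when present, is an HTML fragment rather than plain text). A hedged sketch of a fallback, assuming item has the same structure as in the script above:

from bs4 import BeautifulSoup

def extract_text(item):
    """Prefer the full 'content' field, fall back to 'excerpt' if it is missing."""
    target = item.get('target', {})
    raw = target.get('content') or target.get('excerpt', '')
    # 'content' is HTML, so strip the tags before saving it.
    return BeautifulSoup(raw, 'html.parser').get_text()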

④ The exit condition is shaky: next_url never actually equals 0, so originally the loop simply ran until the final request raised an error and crashed out, which didn't affect the saved data, so I left it, haha. The version above instead breaks once a response comes back with no answers, which should stop cleanly in most cases.

I posted a write-up about Weibo yesterday, but it never seemed to go out, so here it is again.

import os

import pandas as pd
import requests
from bs4 import BeautifulSoup


def fetch(max_id):
    """Request one page of comments and return the parsed JSON."""
    url = 'https://weibo.com/ajax/statuses/buildComments'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    params = {'flow': 0,
              'is_reload': 1,
              'id': 4811924297221505,
              'is_show_bulletin': 2,
              'is_mix': 0,
              'max_id': max_id,
              'count': 20,
              'uid': 2803301701}
    res = requests.get(url, headers=headers, params=params)
    return res.json()


def parse(y):
    """Extract the fields we care about, plus the max_id that points at the next page."""
    data = y['data']                  # y is the parsed JSON returned by fetch()
    max_id = y['max_id']
    comment_all = []
    for item in data:
        name = item['user']['screen_name']
        content = BeautifulSoup(item['text'], 'html.parser').text   # strip HTML tags
        date = item['created_at']
        likes = item['like_counts']
        ip = item['user']['location']
        comment_all.append([name, content, date, likes, ip])
    return comment_all, max_id


def save(data, path, filename):
    """Append rows to the CSV, creating the folder if needed."""
    if not os.path.exists(path):
        os.makedirs(path)
    df = pd.DataFrame(data)
    df.to_csv(path + filename, encoding='utf_8_sig', mode='a',
              index=False, sep=',', header=False)


if __name__ == '__main__':            # note the double ==
    max_id = 0
    path = 'F:/微博/'
    filename = 'weibo.csv'
    csvHeader = [['用户昵称', '评论内容', '评论时间', '被点赞数', '所在城市']]
    save(csvHeader, path, filename)   # write the header row first, reusing save()
    while True:
        y = fetch(max_id)
        comment_all, max_id = parse(y)
        save(comment_all, path, filename)
        if max_id == 0:               # the API returns max_id == 0 on the last page
            break

The general approach:

Analyse how the URL is constructed → fetch the page data → parse the data → store the data.

① Analysing how the URL is constructed:

Is the page static or dynamic?

Rule of thumb: flip through a few pages first, then look at the captured requests. Do the responses show up under XHR or Doc?

Weibo comments are rendered dynamically via Ajax.

How do we deal with Ajax?

Work out the pattern in the request headers and parameters, wrap the request in a function, and impersonate a browser when sending it to the server so that it returns the data.
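In code, impersonating the browser simply means reusing the User-Agent (and cookie, when required) that the browser itself sent, and calling the XHR endpoint directly. A minimal check that the endpoint really answers with JSON might look like this, using the Weibo comments endpoint from the script above:

import requests

url = 'https://weibo.com/ajax/statuses/buildComments'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
params = {'id': 4811924297221505, 'is_show_bulletin': 2, 'max_id': 0, 'count': 20, 'uid': 2803301701}

res = requests.get(url, headers=headers, params=params)
print(res.headers.get('Content-Type'))   # an Ajax endpoint returns JSON, not an HTML document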

② Fetching the page data

Everything in the Request URL after the "?" is a query parameter; write those into the params dict.

The Request Method is GET, so fetch the data with requests.get() and parse the response with res.json().
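As an example, the standard library can split a captured Request URL into a base URL plus a params dict; the URL below is a shortened stand-in for the full capture:

from urllib.parse import parse_qsl, urlsplit

import requests

# A shortened stand-in for the Request URL copied from DevTools.
request_url = 'https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4811924297221505&max_id=0&count=20'

parts = urlsplit(request_url)
base_url = f'{parts.scheme}://{parts.netloc}{parts.path}'
params = dict(parse_qsl(parts.query))   # {'flow': '0', 'is_reload': '1', ...}

res = requests.get(base_url, params=params,
                   headers={'User-Agent': 'Mozilla/5.0'})
print(res.status_code)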

③ Parsing the data

Find the fields you need in the JSON, loop over the items, and collect them into a list.
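Finding the right keys is easiest if you first dump one raw item and read it. A small exploration helper, assuming y is an already-fetched response as returned by either fetch() above:

import json

def explore(y):
    """Print the top-level keys and one raw item so you can pick the fields to extract."""
    print(list(y.keys()))                 # e.g. 'data', 'max_id', ... depending on the API
    if y.get('data'):
        print(json.dumps(y['data'][0], ensure_ascii=False, indent=2)[:800])

# Usage: explore(fetch(0)) with the fetch() from either script above.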

④ Saving the data

Use a pandas DataFrame to write the data to a CSV file, setting up the CSV header first, and exit the loop when max_id == 0.
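An alternative to writing the header as a fake data row is to let pandas write it only on the first append. A small sketch, with the column names and path taken from the Weibo script above:

import os

import pandas as pd

def save(rows, path, filename, columns):
    """Append rows to a CSV, writing the header only when the file does not exist yet."""
    os.makedirs(path, exist_ok=True)
    filepath = os.path.join(path, filename)
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(filepath, mode='a', index=False, encoding='utf_8_sig',
              header=not os.path.exists(filepath))

# Example call, matching the Weibo script's columns:
# save(comment_all, 'F:/微博/', 'weibo.csv',
#      ['用户昵称', '评论内容', '评论时间', '被点赞数', '所在城市'])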

For more detail, see the 机灵鹤 WeChat official account; that author is great!

Finally, a look at the results:

[Zhihu output screenshot]

[Weibo output screenshot]
