【视频】猫途鹰网站评论数据抓取
优采云 发布时间: 2022-06-05 00:57【视频】猫途鹰网站评论数据抓取
示例网址
https://www.tripadvisor.com/Hotel_Review-g294212-d1145964-Reviews-Holiday_Inn_Express_Beijing_Temple_Of_Heaven-Beijing.html
抓取内容:
评论文本内容(文本内容、评论时间等)
评论者信息(昵称、头像、等级、评论记录等)
网站难点
猫途鹰网站的评论数据是动态加载的,所以比较难爬,需要我们进行繁琐的分析才能发现 url 规律。
网址规律分析(时长:10min)
实战(时长:70min)
代码
import csv
import os
import re

import requests
from bs4 import BeautifulSoup

# Seconds to wait for a response before giving up, so that one stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30

# NOTE(review): this pattern matches the empty string at position 0, so the
# extracted name is always ''. The HTML tags that presumably surrounded the
# capture group look to have been lost when the snippet was published --
# restore them against a real MemberOverlay response.
NAME_PATTERN = r'(.*?)'


def review_urls(store_url, first_page=1, last_page=84, page_size=5):
    """Build the paginated review URLs for one store.

    The pagination offset is inserted right after the ``-Reviews-`` marker of
    the store URL, e.g. ``...-Reviews-or5-Holiday_Inn...``.

    Args:
        store_url: URL of the store's review page; must contain ``-Reviews-``.
        first_page: First page index to generate (inclusive).
        last_page: Last page index to generate (inclusive).
        page_size: Number of reviews per page; offset = page_size * page.

    Returns:
        A list of URLs, one per page index in ``first_page..last_page``.
        With the defaults the offsets are 5, 10, ..., 420.
        NOTE(review): offset 0 (the store URL itself) is not part of the
        default range -- confirm whether the first page should be included.

    Raises:
        ValueError: If ``store_url`` does not contain ``-Reviews-``.
    """
    head, marker, tail = store_url.partition('-Reviews-')
    if not marker:
        raise ValueError(
            "store_url does not contain '-Reviews-': {!r}".format(store_url))
    return [
        '{}-Reviews-or{}-{}'.format(head, page_size * page, tail)
        for page in range(first_page, last_page + 1)
    ]


def get_userinfo(src, uid):
    """Fetch the reviewer overlay page for one reviewer.

    Args:
        src: The ``src`` value taken from the review markup.
        uid: The ``uid`` value taken from the review markup.

    Returns:
        The response body of the MemberOverlay request, as text.
    """
    url = 'https://www.tripadvisor.com/MemberOverlay'
    param = {
        'Mode': 'owa',
        'uid': uid,
        'src': src,
        'fus': 'false',
        'partner': 'false',
        'LsoId': '',
        'metaReferer': 'Hotel_Review',
    }
    resp = requests.post(url, data=param, timeout=REQUEST_TIMEOUT)
    return resp.text


def _first_group(pattern, text, default):
    """Return the first capture group of the first match, or ``default``."""
    found = re.search(pattern, text)
    return found.group(1) if found else default


def parse_userinfo(page_html):
    """Parse the reviewer overlay HTML.

    Args:
        page_html: HTML text as returned by ``get_userinfo``.

    Returns:
        A dict with the keys ``name``, ``contributions``, ``helpfulness``
        (strings; the counters default to ``'0'`` when absent) and
        ``distributions`` (list of strings cut out of the matching spans).
    """
    name = _first_group(NAME_PATTERN, page_html, '')
    contributions = _first_group(r'(\d+) Contributions', page_html, '0')
    helpfulness = _first_group(r'(\d+) Helpful votes', page_html, '0')

    soup = BeautifulSoup(page_html, 'html.parser')
    spans = soup.find_all(
        'span',
        {'class': 'rowCountReviewEnhancements rowCellReviewEnhancements'})
    # Fixed-width slice that drops the leading 68 and trailing 7 characters
    # of each serialized span (presumably the tag markup -- TODO confirm).
    distributions = [str(span)[68:-7] for span in spans]

    return {
        'name': name,
        'contributions': contributions,
        'helpfulness': helpfulness,
        'distributions': distributions,
    }


def parse_review_html(page_urls):
    """Download every review page, parse it and append rows to ``test.csv``.

    The CSV lives in the current working directory and has the columns
    ``name`` and ``comment``. The header row is written only when the file
    is new or empty, so repeated runs do not duplicate it.

    Args:
        page_urls: Iterable of review page URLs (see ``review_urls``).
    """
    filepath = os.path.join(os.getcwd(), 'test.csv')
    needs_header = (not os.path.exists(filepath)
                    or os.path.getsize(filepath) == 0)

    # The context manager closes the file even if a request or the parsing
    # of a page raises half-way through the crawl.
    with open(filepath, 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(('name', 'comment'))

        for page_url in page_urls:
            resp = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            soup = BeautifulSoup(resp.text, 'html.parser')
            reviews = soup.find_all(
                'div',
                {'class': 'review hsx_review ui_columns is-multiline '
                          'is-mobile inlineReviewUpdate provider0'})
            for review in reviews:
                comment = (review.contents[1].div.div
                           .contents[2].div.p.get_text())
                # src and uid are the last '_'-separated tokens of the avatar
                # container's id and of its child's last CSS class.
                avatar = review.contents[0].div.div.div
                src = avatar.attrs['id'].split('_')[-1]
                uid = avatar.div.attrs['class'][-1].split('_')[-1]

                user_detail = parse_userinfo(
                    page_html=get_userinfo(src, uid))
                print(user_detail)
                writer.writerow((user_detail['name'], comment))


# Store whose reviews are crawled when the file is run as a script.
start_url = 'https://www.tripadvisor.com/Hotel_Review-g294212-d1145964-Reviews-Holiday_Inn_Express_Beijing_Temple_Of_Heaven-Beijing.html'

if __name__ == '__main__':
    page_urls = review_urls(store_url=start_url)
    parse_review_html(page_urls)
代码链接: 密码: fjn2