【视频】猫途鹰网站评论数据抓取

优采云 发布时间: 2022-06-05 00:57

  【视频】猫途鹰网站评论数据抓取

  示例网址

  https://www.tripadvisor.com/Hotel_Review-g294212-d1145964-Reviews-Holiday_Inn_Express_Beijing_Temple_Of_Heaven-Beijing.html

  抓取内容:

  评论文本内容(文本内容、评论时间等)

  评论者信息(昵称、头像、等级、评论记录等)

  网站难点

  猫途鹰网站的评论数据是动态加载的,所以比较难爬,需要我们进行繁琐的分析才能发现url规律。

  

  网址规律分析(长度:10min)

  实战(时间:70min)

  代码

  import csv
  import os
  import re

  import requests
  from bs4 import BeautifulSoup

  # Seconds to wait for a response before giving up on a request.
  REQUEST_TIMEOUT = 30

  # Marker in a hotel URL after which the pagination offset is inserted.
  REVIEWS_MARKER = '-Reviews-'

  # Class attribute of the <div> that wraps one review on a hotel page.
  REVIEW_DIV_CLASS = ('review hsx_review ui_columns is-multiline is-mobile '
                      'inlineReviewUpdate provider0')

  # Class attribute of the <span> cells read from the member overlay.
  DISTRIBUTION_SPAN_CLASS = 'rowCountReviewEnhancements rowCellReviewEnhancements'


  def review_urls(store_url, page_count=84, page_size=5):
      """Build the paginated review URLs for one hotel page.

      An ``or<offset>-`` segment is inserted right after ``-Reviews-`` in
      ``store_url``; the offset advances by ``page_size`` per page.

      The un-paginated first page (offset 0, i.e. ``store_url`` itself) is
      not included, which matches the original ``range(1, 85)`` loop.

      Args:
          store_url: URL of the hotel page; must contain ``-Reviews-``.
          page_count: Number of paginated URLs to generate.
          page_size: Offset increment between consecutive pages.

      Returns:
          A list of ``page_count`` URLs with offsets ``page_size``,
          ``2 * page_size``, ...

      Raises:
          ValueError: If ``store_url`` does not contain ``-Reviews-``.
      """
      head, marker, tail = store_url.partition(REVIEWS_MARKER)
      if not marker:
          raise ValueError(
              'store_url does not contain {!r}: {!r}'.format(REVIEWS_MARKER, store_url))
      return [
          '{}{}or{}-{}'.format(head, REVIEWS_MARKER, page_size * page, tail)
          for page in range(1, page_count + 1)
      ]


  def get_userinfo(src, uid):
      """Fetch the member-overlay HTML for one reviewer.

      ``src`` and ``uid`` are the identifiers extracted from a review's
      markup by ``parse_review_html``.

      Returns:
          The response body as text.
      """
      url = 'https://www.tripadvisor.com/MemberOverlay'
      param = {
          'Mode': 'owa',
          'uid': uid,
          'src': src,
          'fus': 'false',
          'partner': 'false',
          'LsoId': '',
          'metaReferer': 'Hotel_Review',
      }
      resp = requests.post(url, data=param, timeout=REQUEST_TIMEOUT)
      return resp.text


  def _first_match(pattern, text, default):
      """Return the first ``re.findall`` hit of ``pattern`` in ``text``, or ``default``."""
      matches = re.findall(pattern, text)
      return matches[0] if matches else default


  def parse_userinfo(page_html):
      """Extract reviewer details from member-overlay HTML.

      Returns:
          A dict with the keys ``name``, ``contributions``, ``helpfulness``
          (strings; the two counters default to ``'0'`` when absent) and
          ``distributions`` (list of the text of each matching span).
      """
      # NOTE(review): this pattern always matches the empty string, so ``name``
      # is always ''. It looks like the HTML tags that originally surrounded
      # the capture group were lost -- restore them before relying on ``name``.
      name = _first_match(r'(.*?)', page_html, '')
      contributions = _first_match(r'(\d+) Contributions', page_html, '0')
      helpfulness = _first_match(r'(\d+) Helpful votes', page_html, '0')

      soup = BeautifulSoup(page_html, 'html.parser')
      spans = soup.find_all('span', {'class': DISTRIBUTION_SPAN_CLASS})
      # Read the span text directly instead of slicing the serialized tag at
      # fixed character offsets, which breaks as soon as the markup changes.
      distributions = [span.get_text(strip=True) for span in spans]

      return {
          'name': name,
          'contributions': contributions,
          'helpfulness': helpfulness,
          'distributions': distributions,
      }


  def parse_review_html(page_urls):
      """Download each review page and append (name, comment) rows to test.csv.

      The CSV is created in the current working directory. The header row is
      written only when the file is new or empty, so repeated runs do not
      scatter extra header rows through the data.

      Args:
          page_urls: Iterable of review-page URLs, e.g. from ``review_urls``.
      """
      filepath = os.path.join(os.getcwd(), 'test.csv')
      needs_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

      # ``with`` guarantees the file is closed even if a request or parse fails.
      with open(filepath, 'a', encoding='utf-8', newline='') as csvfile:
          writer = csv.writer(csvfile)
          if needs_header:
              writer.writerow(('name', 'comment'))

          for page_url in page_urls:
              resp = requests.get(page_url, timeout=REQUEST_TIMEOUT)
              soup = BeautifulSoup(resp.text, 'html.parser')

              for review in soup.find_all('div', {'class': REVIEW_DIV_CLASS}):
                  try:
                      comment = review.contents[1].div.div.contents[2].div.p.get_text()
                      member = review.contents[0].div.div.div
                      src = member.attrs['id'].split('_')[-1]
                      uid = member.div.attrs['class'][-1].split('_')[-1]
                  except (AttributeError, IndexError, KeyError):
                      # One review with unexpected markup should not abort
                      # the whole crawl; report it and move on.
                      print('skipping review with unexpected markup on', page_url)
                      continue

                  user_page_htm = get_userinfo(src, uid)
                  user_detail = parse_userinfo(page_html=user_page_htm)
                  print(user_detail)
                  writer.writerow((user_detail['name'], comment))


  # Hotel page whose reviews are scraped when this file is run as a script.
  start_url = 'https://www.tripadvisor.com/Hotel_Review-g294212-d1145964-Reviews-Holiday_Inn_Express_Beijing_Temple_Of_Heaven-Beijing.html'

  if __name__ == '__main__':
      page_urls = review_urls(store_url=start_url)
      parse_review_html(page_urls)

  代码链接: 密码: fjn2

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线