Python集成代码实现了优采云爬取知乎的所有功能以及附加的数据预处理
优采云 发布时间: 2020-08-09 06:23社交: 充分利用最好的机会!了解采集器集成代码的实现! (2020年7月29日)
上一篇文章(上面的链接)对每个部分进行了更详细的描述。本文将介绍用于爬取知乎以及对爬取数据进行预处理的集成代码块。
1. Python集成代码,实现了优采云爬取知乎的所有功能
```python
#!/usr/bin/env python
# coding: utf-8
import json
import os
import re
import time

import jieba
import numpy as np
import pandas as pd
from lxml import etree
from selenium import webdriver
# ---------------------------------------------------------------------------
# Scrape one Zhihu question page: open it in Chrome, expand the question,
# load every answer, parse the rendered HTML with lxml and collect the
# fields into a pandas DataFrame named `data`.
# NOTE(review): the indentation of the pasted script was lost; it has been
# reconstructed here from the statement structure.
# ---------------------------------------------------------------------------
url1 = input("请输入您所需要爬取的网页(知乎)")
browser = webdriver.Chrome("/Users/apple/Downloads/chromedrivermac")
browser.get(url1)

try:
    # Expand the full question description ("more" button), if the question
    # has one.
    button1 = browser.find_elements_by_xpath("""//div[@class= "QuestionHeader-detail"]
//button[contains(@class,"Button") and contains(@class,"QuestionRichText-more")
and contains(@class , "Button--plain")
]""")[0]
    button1.click()
except Exception:
    # Narrowed from a bare `except:`. An IndexError here simply means the
    # question text is short and there is no "expand" button.
    print('这个问题比较简单,并没有问题的全部内容哦!')

# Answers are loaded asynchronously; scroll to the bottom repeatedly so the
# page keeps fetching more of them.
for i in range(20):
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    time.sleep(0.5)
    print(i)

# Dismiss the Zhihu login pop-up.
button2 = browser.find_elements_by_xpath("""//button[@aria-label = '关闭']""")[0]
button2.click()

# Click the "查看全部回答" (view all answers) button.
button3 = browser.find_elements_by_xpath("""//div[@class = 'Question-main']
//a[contains(@class,"ViewAll-QuestionMainAction") and contains(@class , "QuestionMainAction") ]""")[1]
button3.click()


def _find_answer_end_button():
    """Return the blue end-of-answers button elements (empty list if absent)."""
    return browser.find_elements_by_xpath("""//button[contains(@class,"Button")
and contains(@class ,'QuestionAnswers-answerButton')
and contains(@class ,'Button--blue')
and contains(@class ,'Button--spread')
]""")


# Poll until the end-of-answers button appears. The short sleep replaces the
# original tight busy-wait, which re-queried the driver with no pause.
final_end_it = _find_answer_end_button()
while final_end_it == []:
    time.sleep(0.2)
    final_end_it = _find_answer_end_button()

# Jump back to the top, then scroll through the page once more so that every
# lazily rendered answer body ends up in the page source.
js = "var q=document.documentElement.scrollTop=0"
browser.execute_script(js)
for i in range(30):
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    time.sleep(0.5)
    print(i)

# Parse the fully rendered page once.
dom = etree.HTML(browser.page_source)

# --- Fields describing the question itself ---
Followers_number_first = dom.xpath("""//div[@class="QuestionFollowStatus"]//div[@class = "NumberBoard-itemInner"]/strong/text()""")[0]
Browsed_number_first = dom.xpath("""//div[@class="QuestionFollowStatus"]//div[@class = "NumberBoard-itemInner"]/strong/text()""")[1]
# Follower / view counts with thousands separators stripped.
Followers_number_final = re.sub(",", "", Followers_number_first)
Browsed_number_final = re.sub(",", "", Browsed_number_first)
# Question URL and the numeric question id extracted from it.
problem_url = url1
problem_id = re.findall(r"\d+\.?\d*", url1)
# Title, upvotes, comment count, answer count and topic tags.
problem_title = dom.xpath("""//div[@class = 'QuestionHeader']//h1[@class = "QuestionHeader-title"]/text()""")
problem_endorse = dom.xpath("""//div[@class = 'QuestionHeader']//div[@class = "GoodQuestionAction"]/button/text()""")
problem_Comment = dom.xpath("""//div[@class = 'QuestionHeader']//div[@class = "QuestionHeader-Comment"]/button/text()""")
answer_number = dom.xpath("""//div[@class = 'Question-main']//h4[@class = "List-headerText"]/span/text()""")
problem_tags_list = dom.xpath("""//div[@class = 'QuestionHeader-topics']//a[@class = "TopicLink"]/div/div/text()""")

# --- Fields describing each answer ---
# Answer bodies as plain text.
comment_list = dom.xpath("""//div[@class = 'List-item']//div[@class = "RichContent-inner"]""")
comment_list_text = [comment.xpath("string(.)") for comment in comment_list]
# Publication / edit timestamps.
time_list = dom.xpath("""//div[@class = 'List-item']//div[@class = "ContentItem-time"]//span/@data-tooltip""")
edit_time_list = dom.xpath("""//div[@class = 'List-item']//div[@class = "ContentItem-time"]//span/text()""")
# Upvote counts, taken from the vote button's aria-label.
endorse_list = dom.xpath("""//div[@class = 'List-item']//button[contains(@class,"Button") and contains(@class,"VoteButton") and contains(@class , "VoteButton--up")]/@aria-label""")
# Comment counts (text next to the comment icon).
number_of_endorse_list = dom.xpath("""//div[@class = 'List-item']//svg[contains(@class,"Zi") and contains(@class,"Zi--Comment")
and contains(@class,"Button-zi")]/../../text()""")
# Canonical URL of each answer.
answers_url_list = dom.xpath("""//div[@class = 'List-item']//div[contains(@class,"ContentItem") and contains(@class,"AnswerItem")]
/meta[@itemprop = "url"]/@content""")
# Each answer's data-zop attribute holds a JSON object with author metadata.
authors_list = dom.xpath("""//div[@class = 'List-item']//div[contains(@class,"ContentItem") and contains(@class,"AnswerItem")]
/@data-zop""")
authorName_list = []
authorid_list = []
for author_json in authors_list:
    # json.loads replaces the original eval(): never eval() page-supplied
    # text — a crafted attribute value could execute arbitrary code.
    author_info = json.loads(author_json)
    authorName_list.append(author_info['authorName'])
    authorid_list.append(author_info["itemId"])

# --- Assemble the DataFrame: per-answer columns, plus question-level values
# broadcast onto every row ---
data = pd.DataFrame()
data['具体内容'] = comment_list_text
data["发表时间"] = time_list
data["点赞数"] = endorse_list
data["评论人数"] = number_of_endorse_list
data["回答链接"] = answers_url_list
data["作者姓名"] = authorName_list
data['作者id'] = authorid_list
data["问题关注者数量"] = Followers_number_final
data["问题浏览数量"] = Browsed_number_final
data["问题链接"] = problem_url
data["问题ID"] = problem_id[0]
data["问题标题"] = problem_title[0]
data["问题点赞数"] = problem_endorse[0]
data["问题评论数"] = problem_Comment[0]
data["问题回答数"] = answer_number[0]
data["问题标签"] = "&".join(problem_tags_list)
data
复制上面的代码,配置chromedriver环境,输入需要抓取的网页,然后等待抓取完成.
2. 简单的数据清理
def str_to_number(str1):
    """Return the first number found in *str1* as a string, or the int 0.

    Used to turn Zhihu count labels such as "赞同 528 人" into "528".
    NOTE(review): on a match this returns a str, on no match the int 0 —
    the mixed types are preserved because downstream code treats these
    DataFrame columns as opaque values.
    """
    # Indentation of the original paste was lost; restored here.
    mid = re.findall(r"\d+\.?\d*", str1)
    return mid[0] if mid else 0
# Normalise every count column through str_to_number, one column at a time.
for _count_col in ["点赞数", "评论人数", "问题点赞数", "问题评论数", "问题回答数"]:
    data[_count_col] = data[_count_col].apply(str_to_number)
def time_to_datetime(x):
    """Normalise a Zhihu timestamp label into a sortable date string.

    Strips the Chinese prefix (e.g. "发布于", "编辑于"); Zhihu omits the
    year on current-year timestamps, leaving a string shorter than 15
    characters, in which case the leading space is replaced with the year
    prefix. NOTE(review): "2020-" is hard-coded to the scrape date — update
    it (or derive it from the current year) when reusing this script.
    """
    # Indentation of the original paste was lost; restored here.
    stripped = re.sub(r'[\u4e00-\u9fa5]', '', x)
    if len(stripped) < 15:
        # The 15-char threshold was found empirically: full timestamps such
        # as " 2020-07-29 10:00" are at least 15 characters after stripping.
        return re.sub(' ', '2020-', stripped, count=1)
    return stripped
# Convert the raw timestamp labels, then order the answers chronologically
# and renumber the index from zero.
data["发表时间"] = data["发表时间"].apply(time_to_datetime)
data = data.sort_values("发表时间").reset_index(drop=True)
data
3. 使用“问题标题”存储数据