抓取动态网页(Selenium+Python+Selenium环境配置、网页自动化测试系统功能)

优采云 发布时间: 2021-12-24 15:16

  抓取动态网页(Selenium+Python+Selenium环境配置、网页自动化测试系统功能)

  一、Selenium 介绍和配置

  1、Selenium 介绍

  Selenium 是由 ThoughtWorks 专门为 Web 应用程序编写的验收测试工具。Selenium 测试直接在浏览器中运行,可以模拟真实用户的行为。支持的浏览器包括 IE(7、8、9)、Mozilla Firefox、Mozilla Suite 等。该工具的主要功能包括:测试与浏览器的兼容性——测试您的应用程序是否能在不同的浏览器和操作系统下运行良好;测试系统功能——创建回归测试来验证软件功能和用户需求。

  2、Selenium+Python 环境配置

  pip install selenium

  二、网页自动化测试

  1、启动浏览器,打开百度搜索

  from selenium import webdriver

browser = webdriver.Chrome()

browser.get('http://www.baidu.com/')

  2、 定位元素

  input_btn = web.find_element_by_id('kw')

input_btn.send_keys('王者荣耀', Keys.ENTER)

  三、爬取动态网页名言

  1、网页数据分析

  3、爬取数据存储

  with open('Saying.csv', 'w', encoding='utf-8')as fp:

fileWrite = csv.writer(fp)

fileWrite.writerow(['名言', '名人'])

fileWrite.writerows(sayingAndAuthor)

web.close()

  4、 爬取数据

  from selenium.webdriver import Chrome

import time

import csv

web = Chrome(r"D:\\DevTools\\Anaconda\\download\\Anaconda3\\Lib\\site-packages\\selenium\\webdriver\\chrome\\chromedriver.exe")

web.get('http://quotes.toscrape.com/js/')

sayingAndAuthor = []

n = 5

for i in range(0, n):

div_list = web.find_elements_by_class_name('quote')

for div in div_list:

saying = div.find_element_by_class_name('text').text

author = div.find_element_by_class_name('author').text

info = [saying, author]

sayingAndAuthor.append(info)

print('成功爬取第' + str(i + 1) + '页')

if i == n-1:

break

web.find_elements_by_css_selector('[aria-hidden]')[-1].click()

time.sleep(2)

with open('Saying.csv', 'w', encoding='utf-8')as fp:

fileWrite = csv.writer(fp)

fileWrite.writerow(['名言', '名人']) # 写入表头

fileWrite.writerows(sayingAndAuthor)

web.close()

  四、爬取京东网站图书信息

  from selenium.webdriver import Chrome

from selenium.webdriver.common.keys import Keys

web = Chrome(r"D:\\DevTools\\Anaconda\\download\\Anaconda3\\Lib\\site-packages\\selenium\\webdriver\\chrome\\chromedriver.exe")

web.get('https://www.jd.com/')

web.maximize_window()

web.find_element_by_id('key').send_keys('计算机图形学', Keys.ENTER) # 找到输入框输入,回车

  

  # Advance the JD search results to the next page (button class "pn-next").
  web.find_element_by_class_name('pn-next').click() # click the "next page" button

  # Write the collected book rows to a CSV file.
  # BUG FIX: newline='' keeps the csv module from inserting blank lines
  # between rows on Windows; the with-body is also properly indented now.
  with open('计算机图形学.csv', 'w', encoding='utf-8', newline='') as fp:
      writer = csv.writer(fp)
      writer.writerow(['书名', '价格', '作者', '出版社', '预览图片地址'])  # header row
      writer.writerows(all_book_info)

  from selenium.webdriver import Chrome

from selenium.webdriver.common.keys import Keys

import time

from lxml import etree

import csv

# NOTE(review): this module-level setup opens a browser and performs the
# search, but main() further below repeats the exact same steps with its
# own Chrome instance — running this script therefore launches two
# browsers. This block looks like leftover duplication; confirm and remove.
web = Chrome(r"D:\\DevTools\\Anaconda\\download\\Anaconda3\\Lib\\site-packages\\selenium\\webdriver\\chrome\\chromedriver.exe")

web.get('https://www.jd.com/')

web.maximize_window()

# Type the query into the search box (id="key") and submit with Enter.
web.find_element_by_id('key').send_keys('计算机图形学', Keys.ENTER)

def get_onePage_info(web):
    """Scrape one page of JD search results.

    Parameters:
        web: selenium WebDriver currently showing a JD search-result page.

    Returns:
        A list of [book_name, price, author, store, img_url] rows.
    """
    # Scroll to the bottom so JD's lazily-loaded items render before we
    # grab the page source.
    web.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)  # crude wait; WebDriverWait would be more reliable

    tree = etree.HTML(web.page_source)
    li_list = tree.xpath('//li[contains(@class,"gl-item")]')

    book_infos = []
    for li in li_list:
        book_name = ''.join(
            li.xpath('.//div[@class="p-name"]/a/em/text()'))  # 书名

        # Guard against items without a price node instead of crashing.
        price_nodes = li.xpath('.//div[@class="p-price"]/strong/i/text()')
        price = '¥' + price_nodes[0] if price_nodes else '无'  # 价格

        author_span = li.xpath('.//span[@class="p-bi-name"]/a/text()')
        author = author_span[0] if author_span else '无'  # 作者

        store_span = li.xpath('.//span[@class="p-bi-store"]/a[1]/text()')
        store = store_span[0] if store_span else '无'  # 出版社

        # Image URLs come from @src or, for lazy-loaded items, @data-lazy-img.
        # BUG FIX: JD serves protocol-relative URLs ("//img14.360buyimg.com/...");
        # the original prepended 'https' without the colon, producing invalid
        # addresses like 'https//img14...'.
        img_url_a = li.xpath('.//div[@class="p-img"]/a/img')[0]
        src = img_url_a.xpath('./@src') or img_url_a.xpath('./@data-lazy-img')
        img_url = 'https:' + src[0]  # 书本图片地址

        book_infos.append([book_name, price, author, store, img_url])
    return book_infos

def main():
    """Search JD for '计算机图形学', scrape 3 result pages, write them to CSV."""
    web = Chrome(
        r"D:\\DevTools\\Anaconda\\download\\Anaconda3\\Lib\\site-packages\\selenium\\webdriver\\chrome\\chromedriver.exe")
    web.get('https://www.jd.com/')
    web.maximize_window()
    web.find_element_by_id('key').send_keys('计算机图形学', Keys.ENTER)  # type query, press Enter
    time.sleep(2)  # crude wait for the first result page to load

    all_book_info = []
    try:
        for i in range(0, 3):
            all_book_info += get_onePage_info(web)
            print('爬取第' + str(i+1) + '页成功')
            # BUG FIX: the original clicked "next" even after the final
            # iteration — a wasted page load that raises if the button is
            # absent on the last page.
            if i < 2:
                web.find_element_by_class_name('pn-next').click()
                time.sleep(2)
    finally:
        # BUG FIX: the original never closed the browser, leaking the
        # chromedriver process; quit() releases it even if scraping fails.
        web.quit()

    # BUG FIX: newline='' keeps the csv module from inserting blank lines
    # between rows on Windows.
    with open('计算机图形学.csv', 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['书名', '价格', '作者', '出版社', '预览图片地址'])  # header row
        writer.writerows(all_book_info)


if __name__ == '__main__':
    main()

  五、总结

  了解有关在网络上抓取数据的更多信息

  参考文章

  使用Python+Selenium(一)-自动打开百度搜索

  Python+Selenium 动态网页信息抓取

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线