网页表格抓取(这是一个用来更新公园信息的数据管理系统,怎么办?)
优采云 发布时间: 2021-12-04 12:11网页表格抓取(这是一个用来更新公园信息的数据管理系统,怎么办?)
这是一个用于更新园区信息的数据管理系统,本页面截图主要用于更新站点信息。幸运的是,这个页面的每个元素都可以通过 ID 来定位。它收录三种形式的信息:文本框、下拉列表和复选按钮。我的实现思路是将id对应的定位信息放在excel表中,通过抓取excel表中提供的定位信息和需要更新的对应值来更新数据。
这里分别封装了文本框信息输入、下拉列表选择和勾选按钮勾选,然后将相应的方法统一到update方法中来实现。这消除了对每个字段进行单独处理的需要。
包装代码:
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import traceback
import time
class BasePage:
def __init__(self, driver):
self.driver = driver
# 对查找单个页面元素进行封装。
def find_element(self, by, locator):
by = by.lower()
element = None
if by in ['id', 'name', 'xpath', 'class', 'tag', 'link', 'plink']:
try:
if by == 'id':
element = self.driver.find_element_by_id(locator)
elif by == 'name':
element = self.driver.find_element_by_name(locator)
elif by == 'xpath':
element = self.driver.find_element_by_xpath(locator)
elif by == 'class':
element = self.driver.find_element_by_class_name(locator)
elif by == 'tag':
element = self.driver.find_element_by_tag_name(locator)
elif by == 'link':
element = self.driver.find_element_by_link_text(locator)
elif by == 'plink':
element = self.driver.find_element_by_partial_link_text(locator)
else:
print('Not find the element "{}"!'.format(locator))
return element
except NoSuchElementException as e:
print(traceback.print_exc())
else:
print('Provided a wrong locator "{}"!'.format(locator))
# 对查找多个页面元素进行封装。
def find_elements(self, by, locator):
by = by.lower()
elements = None
if by in ['id', 'name', 'xpath', 'class', 'tag', 'link', 'plink']:
try:
if by == 'id':
elements = self.driver.find_elements_by_id(locator)
elif by == 'name':
elements = self.driver.find_elements_by_name(locator)
elif by == 'xpath':
elements = self.driver.find_elements_by_xpath(locator)
elif by == 'class':
elements = self.driver.find_elements_by_class_name(locator)
elif by == 'tag':
elements = self.driver.find_elements_by_tag_name(locator)
elif by == 'link':
elements = self.driver.find_elements_by_link_text(locator)
elif by == 'plink':
elements = self.driver.find_elements_by_partial_link_text(locator)
else:
print('Not find the element "{}"!'.format(locator))
return elements
except NoSuchElementException as e:
print(traceback.print_exc())
else:
print('Provided a wrong locator "{}"!'.format(locator))
# 点击页面元素
def click(self, by, locator):
element = self.find_element(by, locator)
element.click()
# 输入框输入新信息
def type(self, by, locator, value):
y = [x for x in value if x != '']
if len(y) > 0:
element = self.find_element(by, locator)
element.clear()
element.send_keys(value.strip())
else:
pass
# 下拉菜单通过可见文本进行选择
def select(self, by, locator, text):
y = [x for x in text if x != '']
if len(y) > 0:
element = self.find_element(by, locator)
element_element = Select(element)
element_element.select_by_visible_text(text.strip())
else:
pass
# 复选按钮勾选时我们需要首先勾掉已选选项
def uncheck(self, by, locator, options):
y = [x for x in options if x != '']
if len(y) > 0:
elements = self.find_elements(by, locator)
for element in elements:
element.click()
else:
pass
# 选择excel表格所提供的选项进行勾选
def check(self, by, locator, options):
y = [x for x in options if x != '']
if len(y) > 0:
be_options = options.split(',')
for option in be_options:
element = self.find_element(by, locator.format(option.strip()))
element.click()
else:
pass
# def input(self, left_title, excel_title, by, locator, values):
# y = [x for x in values if x != '']
# if len(y) > 0:
# if left_title == excel_title:
# self.type(by, locator, values)
# else:
# pass
# 根据excel表格提供标题所包含的关键字来决定进行哪种数据操作
def update(self, title, by, values):
y = [x for x in values if x != '']
if len(y) > 0:
if '_Text' in title: # 文本框
field = title.strip().split('_')
locator = field[0]
self.type(by, locator, values)
elif '_Select' in title: # 下拉列表
field = title.strip().split('_')
locator = field[0]
self.select(by, locator, values)
elif '_Option' in title: # 复选按钮
field = title.strip().split('_')
locator = field[0]
self.uncheck('xpath', '//input[@checked="" and contains(@id, "{}__")]'.format(locator), values)
self.check('id', '%s__{}'%locator, values)
else:
print('Please indicate the data type for the title "{}" in Excel!!!'.format(title))
else:
pass
# def set(self, left_title, excel_title, by, locator, text):
# y = [x for x in text if x != '']
# if len(y) > 0:
# if left_title == excel_title:
# self.select(by, locator, text)
# else:
# pass
# 登录系统进行封装方便以后重用
def login_orms(self, url, username, password):
# driver = webdriver.Firefox(executable_path='D:\\Selenium 3.14\\geckodriver.exe')
self.driver.delete_all_cookies()
self.driver.get(url)
self.driver.maximize_window()
time.sleep(2)
self.type('id', 'userName', username)
self.type('id', 'password', password)
self.click('id', 'okBtnAnchor')
time.sleep(2)
# 登录系统后选择Contract
def goto_manager_page(self, url, username, password, contract, page):
self.login_orms(url, username, password)
time.sleep(2)
self.click('xpath', "//option[text()='" + contract + " Contract']") # 通过click也可以对下拉列表进行选择
time.sleep(2)
self.click('link', page)
time.sleep(2)
下面是实现代码(没有对操作excel表的封装)
from Orms.BasePage import *
import xlrd
""""""
url = 'reserveamerica.com/xxxx'
account_user = 'xxx'
account_password = 'xxx'
wb = xlrd.open_workbook('D://SiteandLoopSetUp.xls')
""""""
sheet1 = wb.sheet_by_index(0)
sheet2 = wb.sheet_by_index(1)
# read provided information in Excel.
contract = sheet1.col_values(0)[1].strip()
park_name = sheet1.col_values(1)[1].strip()
search_ids = sheet1.col_values(2)[1:]
check_in_times = sheet1.col_values(3)[1:]
check_out_times = sheet1.col_values(4)[1:]
cols_num = sheet1.ncols
print('There will be "{}" fields need to update as following:'.format(cols_num))
left_titles = sheet1.row_values(0)[5:cols_num]
print(left_titles)
browser = webdriver.Firefox(executable_path='D:\\Selenium 3.14\\geckodriver.exe')
driver = BasePage(browser)
driver.goto_manager_page(url=url, username=account_user, password=account_password, contract=contract, page='Inventory Manager')
# Input park name and search
driver.type('id', 'FacilitySearchCriteria.facilityName', park_name)
driver.click('xpath', '//a[@aria-label="Search"]')
time.sleep(3)
# Click facility id in search results to facility details page
driver.click('xpath', '//tr[@name="e_Glow"]/td/a')
time.sleep(2)
# Choose Loop/Site Set-up for Facility Details drop-down list
driver.select('id', 'page_name', 'Loop/Site Set-up')
time.sleep(2)
driver.click('xpath', '//a[@tabindex="-1" and @accesskey="S"]')
time.sleep(2)
for x in range(len(search_ids)):
try:
driver.select('id', 'search_type', 'Site ID')
time.sleep(2)
driver.type('id', 'search_value', search_ids[x])
driver.click('id', 'goAnchor')
time.sleep(2)
driver.click('xpath', '//tr[@name="e_Glow"]/td[2]/a')
time.sleep(2)
# in_time and out_time are required fields, need to handle separately.
in_time = check_in_times[x].strip().split(' ')
y = [x for x in in_time if x != '']
if len(y) > 0:
driver.type('id', 'Checkin Time', in_time[0])
driver.select('id', 'Checkin Time_ispm', in_time[1])
else:
pass
out_time = check_out_times[x].strip().split(' ')
y = [x for x in out_time if x != '']
if len(y) > 0:
driver.type('id', 'Checkout Time', out_time[0])
driver.select('id', 'Checkout Time_ispm', out_time[1])
else:
pass
# update left information in excel.
for i in range(len(left_titles)):
title = left_titles[i]
excel_values = sheet1.col_values(5+i)[1:]
driver.update(title, 'id', excel_values[x])
except Exception as e:
raise Exception
else:
driver.click('xpath', '//a[@aria-label="OK"]')
以上是我模拟手动操作自动填写页面表单的方法。虽然每次做的时候都需要找到需要更新信息的页面元素ID的值,复制到excel中,但是对于几十种合同和几十种形式的网站类型,还有上千个字段需要单独处理的,就简单多了。
当然,肯定有一些不完善的地方,比如强制等待而不是隐形等待。必须有更好的方法来实现这一目标。请指正~~~