网页表格抓取(这是一个用来更新公园信息的数据管理系统，怎么办？)

优采云发布时间: 2021-12-04 12:11

　　这是一个用于更新园区信息的数据管理系统，本页面截图主要用于更新站点信息。幸运的是，这个页面的每个元素都可以通过 ID 来定位。它收录三种形式的信息：文本框、下拉列表和复选按钮。我的实现思路是将id对应的定位信息放在excel表中，通过抓取excel表中提供的定位信息和需要更新的对应值来更新数据。

　　这里分别封装了文本框信息输入、下拉列表选择和勾选按钮勾选，然后将相应的方法统一到update方法中来实现。这消除了对每个字段进行单独处理的需要。

　　包装代码：

　　from selenium.webdriver.support.ui import Select

from selenium.common.exceptions import NoSuchElementException, TimeoutException

import traceback

import time

class BasePage:

def __init__(self, driver):

self.driver = driver

# 对查找单个页面元素进行封装。

def find_element(self, by, locator):

by = by.lower()

element = None

if by in ['id', 'name', 'xpath', 'class', 'tag', 'link', 'plink']:

try:

if by == 'id':

element = self.driver.find_element_by_id(locator)

elif by == 'name':

element = self.driver.find_element_by_name(locator)

elif by == 'xpath':

element = self.driver.find_element_by_xpath(locator)

elif by == 'class':

element = self.driver.find_element_by_class_name(locator)

elif by == 'tag':

element = self.driver.find_element_by_tag_name(locator)

elif by == 'link':

element = self.driver.find_element_by_link_text(locator)

elif by == 'plink':

element = self.driver.find_element_by_partial_link_text(locator)

else:

print('Not find the element "{}"!'.format(locator))

return element

except NoSuchElementException as e:

print(traceback.print_exc())

else:

print('Provided a wrong locator "{}"!'.format(locator))

# 对查找多个页面元素进行封装。

def find_elements(self, by, locator):

by = by.lower()

elements = None

if by in ['id', 'name', 'xpath', 'class', 'tag', 'link', 'plink']:

try:

if by == 'id':

elements = self.driver.find_elements_by_id(locator)

elif by == 'name':

elements = self.driver.find_elements_by_name(locator)

elif by == 'xpath':

elements = self.driver.find_elements_by_xpath(locator)

elif by == 'class':

elements = self.driver.find_elements_by_class_name(locator)

elif by == 'tag':

elements = self.driver.find_elements_by_tag_name(locator)

elif by == 'link':

elements = self.driver.find_elements_by_link_text(locator)

elif by == 'plink':

elements = self.driver.find_elements_by_partial_link_text(locator)

else:

print('Not find the element "{}"!'.format(locator))

return elements

except NoSuchElementException as e:

print(traceback.print_exc())

else:

print('Provided a wrong locator "{}"!'.format(locator))

# 点击页面元素

def click(self, by, locator):

element = self.find_element(by, locator)

element.click()

# 输入框输入新信息

def type(self, by, locator, value):

y = [x for x in value if x != '']

if len(y) > 0:

element = self.find_element(by, locator)

element.clear()

element.send_keys(value.strip())

else:

pass

# 下拉菜单通过可见文本进行选择

def select(self, by, locator, text):

y = [x for x in text if x != '']

if len(y) > 0:

element = self.find_element(by, locator)

element_element = Select(element)

element_element.select_by_visible_text(text.strip())

else:

pass

# 复选按钮勾选时我们需要首先勾掉已选选项

def uncheck(self, by, locator, options):

y = [x for x in options if x != '']

if len(y) > 0:

elements = self.find_elements(by, locator)

for element in elements:

element.click()

else:

pass

# 选择excel表格所提供的选项进行勾选

def check(self, by, locator, options):

y = [x for x in options if x != '']

if len(y) > 0:

be_options = options.split(',')

for option in be_options:

element = self.find_element(by, locator.format(option.strip()))

element.click()

else:

pass

# def input(self, left_title, excel_title, by, locator, values):

# y = [x for x in values if x != '']

# if len(y) > 0:

# if left_title == excel_title:

# self.type(by, locator, values)

# else:

# pass

# 根据excel表格提供标题所包含的关键字来决定进行哪种数据操作

def update(self, title, by, values):

y = [x for x in values if x != '']

if len(y) > 0:

if '_Text' in title: # 文本框

field = title.strip().split('_')

locator = field[0]

self.type(by, locator, values)

elif '_Select' in title: # 下拉列表

field = title.strip().split('_')

locator = field[0]

self.select(by, locator, values)

elif '_Option' in title: # 复选按钮

field = title.strip().split('_')

locator = field[0]

self.uncheck('xpath', '//input[@checked="" and contains(@id, "{}__")]'.format(locator), values)

self.check('id', '%s__{}'%locator, values)

else:

print('Please indicate the data type for the title "{}" in Excel!!!'.format(title))

else:

pass

# def set(self, left_title, excel_title, by, locator, text):

# y = [x for x in text if x != '']

# if len(y) > 0:

# if left_title == excel_title:

# self.select(by, locator, text)

# else:

# pass

# 登录系统进行封装方便以后重用

def login_orms(self, url, username, password):

# driver = webdriver.Firefox(executable_path='D:\\Selenium 3.14\\geckodriver.exe')

self.driver.delete_all_cookies()

self.driver.get(url)

self.driver.maximize_window()

time.sleep(2)

self.type('id', 'userName', username)

self.type('id', 'password', password)

self.click('id', 'okBtnAnchor')

time.sleep(2)

# 登录系统后选择Contract

def goto_manager_page(self, url, username, password, contract, page):

self.login_orms(url, username, password)

time.sleep(2)

self.click('xpath', "//option[text()='" + contract + " Contract']") # 通过click也可以对下拉列表进行选择

time.sleep(2)

self.click('link', page)

time.sleep(2)

　　下面是实现代码（没有对操作excel表的封装）

　　from Orms.BasePage import *

import xlrd

""""""

url = 'reserveamerica.com/xxxx'

account_user = 'xxx'

account_password = 'xxx'

wb = xlrd.open_workbook('D://SiteandLoopSetUp.xls')

""""""

sheet1 = wb.sheet_by_index(0)

sheet2 = wb.sheet_by_index(1)

# read provided information in Excel.

contract = sheet1.col_values(0)[1].strip()

park_name = sheet1.col_values(1)[1].strip()

search_ids = sheet1.col_values(2)[1:]

check_in_times = sheet1.col_values(3)[1:]

check_out_times = sheet1.col_values(4)[1:]

cols_num = sheet1.ncols

print('There will be "{}" fields need to update as following:'.format(cols_num))

left_titles = sheet1.row_values(0)[5:cols_num]

print(left_titles)

browser = webdriver.Firefox(executable_path='D:\\Selenium 3.14\\geckodriver.exe')

driver = BasePage(browser)

driver.goto_manager_page(url=url, username=account_user, password=account_password, contract=contract, page='Inventory Manager')

# Input park name and search

driver.type('id', 'FacilitySearchCriteria.facilityName', park_name)

driver.click('xpath', '//a[@aria-label="Search"]')

time.sleep(3)

# Click facility id in search results to facility details page

driver.click('xpath', '//tr[@name="e_Glow"]/td/a')

time.sleep(2)

# Choose Loop/Site Set-up for Facility Details drop-down list

driver.select('id', 'page_name', 'Loop/Site Set-up')

time.sleep(2)

driver.click('xpath', '//a[@tabindex="-1" and @accesskey="S"]')

time.sleep(2)

for x in range(len(search_ids)):

try:

driver.select('id', 'search_type', 'Site ID')

time.sleep(2)

driver.type('id', 'search_value', search_ids[x])

driver.click('id', 'goAnchor')

time.sleep(2)

driver.click('xpath', '//tr[@name="e_Glow"]/td[2]/a')

time.sleep(2)

# in_time and out_time are required fields, need to handle separately.

in_time = check_in_times[x].strip().split(' ')

y = [x for x in in_time if x != '']

if len(y) > 0:

driver.type('id', 'Checkin Time', in_time[0])

driver.select('id', 'Checkin Time_ispm', in_time[1])

else:

pass

out_time = check_out_times[x].strip().split(' ')

y = [x for x in out_time if x != '']

if len(y) > 0:

driver.type('id', 'Checkout Time', out_time[0])

driver.select('id', 'Checkout Time_ispm', out_time[1])

else:

pass

# update left information in excel.

for i in range(len(left_titles)):

title = left_titles[i]

excel_values = sheet1.col_values(5+i)[1:]

driver.update(title, 'id', excel_values[x])

except Exception as e:

raise Exception

else:

driver.click('xpath', '//a[@aria-label="OK"]')

　　以上是我模拟手动操作自动填写页面表单的方法。虽然每次做的时候都需要找到需要更新信息的页面元素ID的值，复制到excel中，但是对于几十种合同和几十种形式的网站类型，还有上千个字段需要单独处理的，就简单多了。

　　当然，肯定有一些不完善的地方，比如强制等待而不是隐形等待。必须有更好的方法来实现这一目标。请指正~~~

0

2021-12-04

网页表格抓取

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

网页表格抓取(这是一个用来更新公园信息的数据管理系统，怎么办？)

0 个评论

发起人

AI时代内容工厂

网页表格抓取(这是一个用来更新公园信息的数据管理系统，怎么办？)

0 个评论

发起人

相关问题