Scraping Dynamic Web Pages (Implementing a Crawler with Selenium and Google Chrome)
This process is a bit more involved and takes several steps to complete; they are introduced below:
1. Install the selenium library; it can be installed directly with the command 'pip install selenium'.
2. Download ChromeDriver and add it to your environment variables (PATH), or simply drop the .exe into the Scripts folder of your Python installation. When downloading, be sure to pick the version that matches your browser. To check the browser version: top-right menu -> Help -> About Google Chrome. The download site is the ChromeDriver download page, which lists which driver version corresponds to which Chrome version. To verify that the download and installation succeeded, just run the following code; if it raises no error, the installation was successful.
import selenium.webdriver as driver
index = driver.Chrome()
index.get('https://www.wdzj.com/dangan')
print(index)
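If you would rather not put chromedriver on PATH or copy it into the Scripts folder, Selenium 3 also accepts an explicit driver path. A minimal sketch, assuming the driver was saved to D:\chromedriver.exe (that path is only an illustration; point it at wherever you placed the file):
import selenium.webdriver as driver

# executable_path tells Selenium 3 where chromedriver.exe lives;
# Selenium 4 replaces this argument with a Service object
browser = driver.Chrome(executable_path='D:\\chromedriver.exe')
browser.get('https://www.wdzj.com/dangan')
print(browser.title)
browser.quit()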
3. Selenium offers several ways to locate elements on a page; here is a brief overview.
The official documentation address is:
Methods for finding a single element, and methods for finding multiple elements (which return a list), are demonstrated in the sketch below:
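A minimal sketch of both families of calls, assuming Selenium 3.x (the find_element_by_* / find_elements_by_* helpers were removed in Selenium 4 in favour of By-based locators); the XPath and link text are taken from the crawler code later in this article, so they may break if the site changes:
import selenium.webdriver as driver

browser = driver.Chrome()
browser.get('https://www.wdzj.com/dangan/')

# find a single element: returns the first match, raises NoSuchElementException otherwise
first_company = browser.find_element_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')
next_page = browser.find_element_by_link_text('下一页')

# find multiple elements: same names with "elements", returns a (possibly empty) list
all_companies = browser.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')

print(first_company.text, next_page.text, len(all_companies))
browser.quit()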
4. With the basic pieces in place you can happily start crawling (the code has not been tidied up, so it is simply dumped here first)!
import selenium.webdriver as driver
import xlwt

# note: the find_element_by_* / find_elements_by_* helpers used below require Selenium 3.x;
# they were removed in Selenium 4
URL = 'https://www.wdzj.com/dangan/'
# KEYWORD = '银行存管'

def key_word():
    # index = driver.Chrome()
    # index.get(URL)
    # select_data = index.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')
    # print(index.current_url)
    # keyword_index = index.find_element_by_link_text()
    # keyword_index.click()
    names = []
    banks = []
    tel_nums = []
    urls = []
    for i in range(0, 76):
        page_url = URL + 'search?filter=e1&currentPage=' + str(i + 1)
        index_page = driver.Chrome()
        index_page.get(page_url)
        select_data = index_page.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')
        print(index_page.current_url)
        for data in select_data:
            names.append(data.text)
            print(names)  # company names
            sec_url = data.get_attribute("href")
            index_sec = driver.Chrome()
            index_sec.get(sec_url)
            # print(index_sec.current_url)  # detail-page link
            yhcg = index_sec.find_element_by_xpath('/html/body/div[10]/div/div[1]/div[1]/dl[1]/dd[2]/div[2]')
            banks.append(yhcg.text)
            # print(banks)  # custodian bank
            tel_num = index_sec.find_element_by_link_text('*敏*感*词*')  # the link text was censored in the source article
            tel_num.click()
            number = index_sec.find_element_by_xpath('//*[@class="da-lxfs zzfwbox"]/dl[1]/dd[1]/div[2]')
            tel_nums.append(number.text)
            # print(tel_nums)  # customer-service phone
            yuming = index_sec.find_element_by_link_text('工商/备案')
            yuming.click()
            yu_beian = index_sec.find_element_by_xpath('//*[@class="lcen"]/table/tbody/tr[7]/td[2]')
            urls.append(yu_beian.text)
            print(urls)  # domain names
            index_sec.close()
        # print(page_url)
        # next_page = index.find_element_by_link_text('下一页')
        # next_page.click()
    return names, banks, tel_nums, urls

def xls():
    wb = xlwt.Workbook()
    ws = wb.add_sheet('numbers')
    ws.write(0, 0, '序号')      # serial number
    ws.write(0, 1, '公司名称')  # company name
    ws.write(0, 2, '银行存管')  # custodian bank
    ws.write(0, 3, '客服电话')  # customer-service phone
    ws.write(0, 4, '公司域名')  # company domain
    names, banks, tel_nums, urls = key_word()
    print(len(names))
    for i in range(0, len(names)):
        ws.write(i + 1, 0, i + 1)
        ws.write(i + 1, 1, names[i])
        ws.write(i + 1, 2, banks[i])
        ws.write(i + 1, 3, tel_nums[i])
        ws.write(i + 1, 4, urls[i])
    wb.save('D:\\number.xls')

def run():
    xls()

run()
Revised version: instead of collecting everything and writing the spreadsheet once at the end, each record is appended to D:\number.xls as soon as it is scraped (by re-opening the file with xlrd and copying it with xlutils.copy), so partial results survive if the crawl is interrupted, and each browser window is closed as soon as it is no longer needed:
import selenium.webdriver as driver
import xlwt
from xlutils.copy import copy
import xlrd

URL = 'https://www.wdzj.com/dangan/'
# KEYWORD = '银行存管'

def key_word():
    names = []
    banks = []
    tel_nums = []
    urls = []
    count = 0
    # write the header row first, then append one row per record as it is scraped
    wb = xlwt.Workbook()
    ws = wb.add_sheet('numbers')
    ws.write(0, 0, '序号')      # serial number
    ws.write(0, 1, '公司名称')  # company name
    ws.write(0, 2, '银行存管')  # custodian bank
    ws.write(0, 3, '客服电话')  # customer-service phone
    ws.write(0, 4, '公司域名')  # company domain
    wb.save('D:\\number.xls')
    for i in range(0, 76):
        page_url = URL + 'search?filter=e1&currentPage=' + str(i + 1)
        index_page = driver.Chrome()
        index_page.get(page_url)
        select_data = index_page.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a')
        print(index_page.current_url)
        for data in select_data:
            names.append(data.text)
            print(names)  # company names
            sec_url = data.get_attribute("href")
            index_sec = driver.Chrome()
            index_sec.get(sec_url)
            # print(index_sec.current_url)  # detail-page link
            yhcg = index_sec.find_element_by_xpath('//*[@class="bgbox-bt zzfwbox"]/dl/dd/div[@class="r" and contains(text(),"存管")]')
            banks.append(yhcg.text)
            print(banks)  # custodian bank
            tel_num = index_sec.find_element_by_link_text('*敏*感*词*')  # the link text was censored in the source article
            tel_num.click()
            number = index_sec.find_element_by_xpath('//*[@class="da-lxfs zzfwbox"]/dl[1]/dd[1]/div[2]')
            tel_nums.append(number.text)
            # print(tel_nums)  # customer-service phone
            yuming = index_sec.find_element_by_link_text('工商/备案')
            yuming.click()
            yu_beian = index_sec.find_element_by_xpath('//*[@class="lcen"]/table/tbody/tr[7]/td[2]')
            urls.append(yu_beian.text)
            print(urls)  # domain names
            # re-open the workbook and append this record immediately
            oldWb = xlrd.open_workbook('D:\\number.xls', formatting_info=True)
            newWb = copy(oldWb)
            news = newWb.get_sheet(0)
            news.write(count + 1, 0, count + 1)
            news.write(count + 1, 1, names[count])
            news.write(count + 1, 2, banks[count])
            news.write(count + 1, 3, tel_nums[count])
            news.write(count + 1, 4, urls[count])
            newWb.save('D:\\number.xls')
            print(count)
            count += 1
            index_sec.close()
        index_page.close()
    return names, banks, tel_nums, urls

def run():
    key_word()

run()
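A final note on the design: both versions above launch a brand-new Chrome window for every list page and every detail page, which is slow and memory-hungry. A minimal sketch of the same crawl reusing one window for list pages and one for detail pages (identifiers and XPaths follow the code above; this is only a sketch of the idea, not the author's original, and it has not been run against the live site):
import selenium.webdriver as driver

URL = 'https://www.wdzj.com/dangan/'
list_browser = driver.Chrome()      # reused for every list page
detail_browser = driver.Chrome()    # reused for every detail page
names, banks = [], []

for i in range(0, 76):
    list_browser.get(URL + 'search?filter=e1&currentPage=' + str(i + 1))
    for link in list_browser.find_elements_by_xpath('//*[@id="showTable"]/ul/li/div[1]/h2/a'):
        names.append(link.text)
        # list_browser stays on the list page, so its link elements remain valid
        detail_browser.get(link.get_attribute('href'))
        yhcg = detail_browser.find_element_by_xpath(
            '//*[@class="bgbox-bt zzfwbox"]/dl/dd/div[@class="r" and contains(text(),"存管")]')
        banks.append(yhcg.text)

list_browser.quit()
detail_browser.quit()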