Asynchronous crawling in Scrapy with Selenium and PhantomJS
Selenium makes it easy to fetch a page's AJAX-rendered content, and it can simulate many user actions such as clicking and typing text, which is very useful when crawling pages with Scrapy.
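A minimal standalone sketch of that kind of interaction (the URL and the element name are placeholders, and the old find_element_by_name API matches the Selenium versions that still shipped PhantomJS support):

from selenium import webdriver

driver = webdriver.PhantomJS()
driver.get("http://example.com/search")   # placeholder URL
box = driver.find_element_by_name("q")    # locate the search input
box.send_keys("scrapy")                   # simulate typing
box.submit()                              # simulate submitting the form
print(driver.page_source[:200])           # AJAX-rendered HTML is now available
driver.quit()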
Plenty of articles online show how to bolt Selenium onto Scrapy, but very few of them keep the crawl asynchronous. The code below rewrites Scrapy's download handler to integrate Selenium while preserving asynchronous downloads.
To use PhantomJSDownloadHandler, register it under DOWNLOAD_HANDLERS in the project's settings file.
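A minimal settings.py sketch (the module path myproject.handlers is a placeholder for wherever you put the class; the two PHANTOMJS_* keys are the ones the handler reads):

# settings.py
DOWNLOAD_HANDLERS = {
    'http': 'myproject.handlers.PhantomJSDownloadHandler',
    'https': 'myproject.handlers.PhantomJSDownloadHandler',
}
PHANTOMJS_OPTIONS = {}   # keyword arguments passed through to webdriver.PhantomJS
PHANTOMJS_MAXRUN = 10    # size of the driver pool, i.e. the concurrency limit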
# encoding: utf-8
from __future__ import unicode_literals

from scrapy import signals
from scrapy.signalmanager import SignalManager
from scrapy.responsetypes import responsetypes
from scrapy.xlib.pydispatch import dispatcher  # removed in Scrapy 2.x; this handler targets older Scrapy
from selenium import webdriver
from six.moves import queue
from twisted.internet import defer, threads
from twisted.python.failure import Failure


class PhantomJSDownloadHandler(object):

    def __init__(self, settings):
        self.options = settings.get('PHANTOMJS_OPTIONS', {})

        max_run = settings.get('PHANTOMJS_MAXRUN', 10)
        self.sem = defer.DeferredSemaphore(max_run)
        self.queue = queue.LifoQueue(max_run)

        SignalManager(dispatcher.Any).connect(self._close, signal=signals.spider_closed)

    def download_request(self, request, spider):
        """Use a semaphore to guard a pool of PhantomJS instances."""
        return self.sem.run(self._wait_request, request, spider)

    def _wait_request(self, request, spider):
        try:
            # reuse an idle driver from the pool if one is available
            driver = self.queue.get_nowait()
        except queue.Empty:
            driver = webdriver.PhantomJS(**self.options)

        driver.get(request.url)
        # ghostdriver won't respond to a window switch until the page is loaded,
        # so this call doubles as a wait that runs off the reactor thread
        dfd = threads.deferToThread(
            lambda: driver.switch_to.window(driver.current_window_handle))
        dfd.addCallback(self._response, driver, spider)
        return dfd

    def _response(self, _, driver, spider):
        body = driver.execute_script("return document.documentElement.innerHTML")
        # cannot access response headers in Selenium; non-HTML bodies (e.g. JSON)
        # get wrapped by PhantomJS, so fall back to the raw text in that case
        if body.startswith("<head></head>"):
            body = driver.execute_script("return document.documentElement.textContent")
        url = driver.current_url
        respcls = responsetypes.from_args(url=url, body=body[:100].encode('utf8'))
        resp = respcls(url=url, body=body, encoding="utf-8")

        # give the spider a chance to reject the response and discard the driver
        response_failed = getattr(spider, "response_failed", None)
        if response_failed and callable(response_failed) and response_failed(resp, driver):
            driver.close()
            return defer.fail(Failure())
        else:
            self.queue.put(driver)
            return defer.succeed(resp)

    def _close(self):
        # shut down every pooled driver when the spider closes
        while not self.queue.empty():
            driver = self.queue.get_nowait()
            driver.close()
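The handler looks up an optional response_failed method on the spider to decide whether a response should be rejected and its driver discarded. A sketch of that hook on the spider side (the spider name, URL, and failure test are illustrative, not from the original article):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com']

    def response_failed(self, response, driver):
        # return True to discard this driver and treat the download as failed
        return 'captcha' in response.url

    def parse(self, response):
        yield {'title': response.css('title::text').get()}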
That is the whole approach to asynchronous crawling with PhantomJS in Scrapy. Hopefully it gives you a useful reference.