动态网页抓取(关于with语句与上下文管理器三、思路整理(组图))

优采云发布时间: 2021-11-21 06:09

　　退出首页，在目标页面()下，如图，可以通过选择我们设置的charles来设置全局代理（网站的所有数据请求都经过代理）

　　也可以只为这个网站设置代理

　　二、知识储备

　　requests.get()：请求官方文档

　　在这里，在 requests.get() 中，我使用了两个附加参数 verify=False, stream=True

　　verify = False 可以绕过网站的SSL验证

　　stream=True 会推迟响应体的下载：请求返回时只获取响应头，连接保持打开，直到响应内容被读取完毕或响应被关闭。下载图片时可以配合 iter_content() 分块写入文件，在图片下载完成之前保持数据流不关闭，以保证图片下载的完整性。作者实测：如果去掉这个参数，重新下载图片，会发现图片无法下载成功。

　　contextlib.closing()：contextlib 库下的 closing 方法，其作用是把一个带有 close() 方法的对象包装成上下文管理器，使其可以在 with 语句中使用，并在退出 with 块时自动调用 close()。

　　使用with open()后，我们知道with的好处是可以帮助我们自动关闭资源对象，从而简化代码。其实任何对象，只要正确实现了上下文管理，都可以在with语句中使用。还有一篇关于 with 语句和上下文管理器的文章

　　三、思路整理

　　这次爬取的动态网页是一个壁纸网站，上面有精美的壁纸，我们的目的是通过爬虫把网站上的原壁纸下载到本地

　　网站网址：

　　已知该网站是动态网站，那么就需要通过抓包来分析网站的js请求，弄清它是如何获取数据的。

　　打开网站时使用Charles获取请求。从请求中找到有用的json数据，对比下载链接的url，发现下载链接的变化部分是图片的id，图片的id是从json中爬出来的。填写下载链接上的id部分，进行下载操作

　　四、具体步骤

　　通过Charles在打开网站时获取请求，如图

　　从图中不难发现，headers中有一个授权Client-ID参数，需要记下来，添加到我们自己的请求头中。（因为学习笔记，知道这个参数是反爬虫需要的参数。具体检测反爬虫操作，估计可以一一加参数试试）从中找到有用的json

　　存在

　　这里我们发现有一个图片ID。在网页上点击下载图片，从抓包中抓取下载链接，发现下载链接更改的部分是图片的id。

　　这样就可以确定爬取图片的具体步骤了。从json中爬取图片id，在下载链接上填写id部分，执行下载操作并分析，总结代码步骤

　　五、代码整理

　　代码步骤

　　1. 抓取图片id并保存到列表中

　　# -*- coding:utf-8 -*-

import requests,json

def get_ids():
    """Fetch the photo ids listed on the Unsplash home feed.

    Sends one GET request to the ``napi/feeds/home`` endpoint and collects
    the ``id`` of every entry under the ``photos`` key of the JSON reply.

    Returns:
        list | bool: The list of photo ids on success, or ``False`` when the
        request fails or the reply does not have the expected structure.
    """
    id_url = 'http://unsplash.com/napi/feeds/home'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.79 Safari/537.36',
        # Placeholder: the real value has to be captured from the site's
        # own requests with a proxy such as Charles.
        'authorization': '*************',
    }

    try:
        # verify=False skips TLS certificate verification; it is used here
        # to work around the SSLError raised by this site.
        response = requests.get(id_url, headers=header, verify=False, timeout=30)
        # Treat HTTP error statuses as failures instead of trying to parse
        # an error page as JSON.
        response.raise_for_status()
        response.encoding = 'utf-8'

        dic = json.loads(response.text)
        print(type(dic))
        print("next_page:{}".format(dic['next_page']))

        id_lists = [each['id'] for each in dic['photos']]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # RequestException: network / HTTP failure.
        # ValueError: body is not valid JSON.
        # KeyError / TypeError: JSON does not have the expected structure.
        print("图片id读取发生异常")
        return False

    print("图片id读取完成")
    return id_lists

if __name__ == '__main__':
    id_lists = get_ids()

    # get_ids() returns False on failure; an empty list is still a valid
    # (successful) result, so compare against False explicitly.
    if id_lists is not False:
        for img_id in id_lists:
            print(img_id)

　　结果如图，已经成功打印出图片ID

　　根据图片id下载图片

　　import os

from contextlib import closing

import requests

from datetime import datetime

def download(img_id):
    """Download one Unsplash photo to ``images/<img_id>.jpg``.

    The ``images`` directory is created in the current working directory if
    it does not exist. When the target file is already present the function
    prints a notice and returns without issuing any HTTP request. On a
    failed download the partially written file is removed.

    Args:
        img_id: The photo id to substitute into the download URL; it is also
            used as the file name.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.79 Safari/537.36',
        # Placeholder: the real value has to be captured from the site's
        # own requests with a proxy such as Charles.
        'authorization': '***********',
    }
    file_path = 'images'
    download_url = 'https://unsplash.com/photos/{}/download?force=true'.format(img_id)
    file = '{}/{}.jpg'.format(file_path, img_id)
    chunk_size = 1024

    # exist_ok avoids the check-then-create race of listdir() + makedirs().
    os.makedirs(file_path, exist_ok=True)

    # Check before opening the connection so that an already downloaded
    # photo does not cost a network round trip.
    if os.path.exists(file):
        print("图片{}.jpg已存在,跳过本次下载".format(img_id))
        return

    try:
        # stream=True defers the body download so it can be written to disk
        # chunk by chunk; closing() releases the connection afterwards.
        # verify=False skips TLS certificate verification.
        with closing(requests.get(download_url, headers=header, verify=False,
                                  stream=True, timeout=30)) as response:
            # Do not save an HTTP error page under a .jpg name.
            response.raise_for_status()

            start_time = datetime.now()
            # 'wb' rather than 'ab+': the file is known not to exist here,
            # and appending could only ever corrupt a leftover partial file.
            with open(file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
            end_time = datetime.now()

        sec = int((end_time - start_time).total_seconds())
        print("下载图片{}完成,耗时:{}s".format(img_id, sec))
    except (requests.RequestException, OSError):
        # Remove the partial file so the next run does not skip it as
        # "already downloaded".
        if os.path.exists(file):
            os.remove(file)
        print("下载图片{}失败".format(img_id))

if __name__ == '__main__':
    # Sample run: fetch a single photo by a known id.
    download('vgpHniLr9Uw')

　　下载前

　　下载后

　　整合代码，批量下载

　　# -*- coding:utf-8 -*-

import requests,json

from urllib.request import urlretrieve

import os

from datetime import datetime

from contextlib import closing

import time

class UnsplashSpider:
    """Collect photo ids from the Unsplash home feed and download the photos.

    Attributes are set in ``__init__``:
        id_url: Endpoint that returns the home feed as JSON.
        header: Request headers sent with every request.
        id_lists: Photo ids accumulated by ``get_ids``.
        download_url: URL template with one ``{}`` slot for the photo id.
    """

    def __init__(self):
        """Set up the endpoint URLs, the request headers and an empty id list."""
        self.id_url = 'http://unsplash.com/napi/feeds/home'
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/61.0.3163.79 Safari/537.36',
            # Placeholder: fill in the value captured from the site's own
            # requests with a proxy such as Charles.
            'authorization': '***********',
        }
        self.id_lists = []
        self.download_url = 'https://unsplash.com/photos/{}/download?force=true'
        print("init")

    def get_ids(self):
        """Fetch the home feed and append every photo id to ``self.id_lists``.

        Returns:
            list | bool: ``self.id_lists`` on success, or ``False`` when the
            request fails or the reply does not have the expected structure.
        """
        try:
            # verify=False skips TLS certificate verification; it is used
            # here to work around the SSLError raised by this site.
            response = requests.get(self.id_url, headers=self.header,
                                    verify=False, timeout=30)
            # Treat HTTP error statuses as failures instead of trying to
            # parse an error page as JSON.
            response.raise_for_status()
            response.encoding = 'utf-8'

            dic = json.loads(response.text)
            print(type(dic))
            print("next_page:{}".format(dic['next_page']))

            self.id_lists.extend(each['id'] for each in dic['photos'])
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # RequestException: network / HTTP failure.
            # ValueError: body is not valid JSON.
            # KeyError / TypeError: JSON does not have the expected structure.
            print("图片id读取发生异常")
            return False

        print("图片id读取完成")
        return self.id_lists

    def download(self, img_id):
        """Download one photo to ``images/<img_id>.jpg``.

        The ``images`` directory is created in the current working directory
        if it does not exist. When the target file is already present the
        method prints a notice and returns without issuing any HTTP request.
        On a failed download the partially written file is removed.

        Args:
            img_id: The photo id to substitute into ``self.download_url``;
                it is also used as the file name.

        Returns:
            None. Progress and failures are reported with ``print``.
        """
        file_path = 'images'
        download_url = self.download_url.format(img_id)
        file = '{}/{}.jpg'.format(file_path, img_id)
        chunk_size = 1024

        # exist_ok avoids the check-then-create race of listdir() + makedirs().
        os.makedirs(file_path, exist_ok=True)

        # Check before opening the connection so that an already downloaded
        # photo does not cost a network round trip.
        if os.path.exists(file):
            print("图片{}.jpg已存在,跳过本次下载".format(img_id))
            return

        try:
            # stream=True defers the body download so it can be written to
            # disk chunk by chunk; closing() releases the connection.
            with closing(requests.get(download_url, headers=self.header,
                                      verify=False, stream=True,
                                      timeout=30)) as response:
                # Do not save an HTTP error page under a .jpg name.
                response.raise_for_status()

                start_time = datetime.now()
                # 'wb' rather than 'ab+': the file is known not to exist
                # here, and appending could only corrupt a leftover file.
                with open(file, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                end_time = datetime.now()

            sec = int((end_time - start_time).total_seconds())
            print("下载图片{}完成,耗时:{}s".format(img_id, sec))
        except (requests.RequestException, OSError):
            # Remove the partial file so the next run does not skip it as
            # "already downloaded".
            if os.path.exists(file):
                os.remove(file)
            print("下载图片{}失败".format(img_id))

if __name__ == '__main__':
    us = UnsplashSpider()
    id_lists = us.get_ids()

    # get_ids() returns False on failure; an empty list is still a valid
    # (successful) result, so compare against False explicitly.
    if id_lists is not False:
        for img_id in id_lists:
            us.download(img_id)
            # A reasonable delay between downloads, out of respect for the site.
            time.sleep(1)

　　六、结论

　　由于本文为学习笔记，中间省略了一些细节。

　　结合其他资料，发现爬取动态网站的关键点是抓包分析。只要能从包中分析出关键数据，就剩下写爬虫的步骤了

0

2021-11-21

动态网页抓取

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

动态网页抓取(关于with语句与上下文管理器三、思路整理(组图))

0 个评论

发起人