抓取动态网页(图片都是极好的,这岂不是诱惑着我爬虫)

优采云 发布时间: 2022-03-10 07:10

  抓取动态网页(图片都是极好的,这岂不是诱惑着我爬虫)

  一次偶然的机会,我发现了花瓣网(美女图集_美女相册_花瓣画板)这个神奇的网站,妹子的图片都很棒。这岂不是在诱惑我写一个爬虫把它们抓下来。

  很遗憾,检查页面后发现这是一个动态加载的网站。不过没关系,只要分析出请求规则,还是可以爬取的。

  分析网页的规律:每张图片的详情页url中都有一个pin_id,拿到pin_id,就可以进入图片的详情页。

  F12抓包后,找到了请求的url:

  http://huaban.com/boards/16115516/?jawjlvud&max=1339300078&limit=20&wfl=1

  这是某个画板下不断请求的url。可以发现,max= 后面跟着的是某张图片的pin_id。如果max为空,则返回前 20 个 pin_id。

  

  从响应中取出json数据,可以看到:

  

  pin_id就在json数据中,而最后一个pin_id正是继续加载时url中max=参数的值。于是有了一个想法:通过不断请求,把上一次返回的最后一个pin_id作为下一次请求的max参数,就可以得到所有的pin_id。

  拿到pin_id后进入详情页,发现虽然返回了html文本,但其中并没有图片。

  找到的图片下载链接格式如下:

  url = 'http://img.hb.aicdn.com/'+key+'_fw658'

  这个key包含在返回的 html 中一段 json 格式的字符串里,把它提取出来就可以了。

  

  下面是完整的代码,已经更新过一次,可以直接返回json格式的数据,无需正则匹配,代码量少很多。

import os

import requests

import jsonpath

# Headers for the site's JSON endpoints. The script calls ``.json()`` on every
# response fetched with these, so the Accept / X-Request* markers are what
# request a JSON body instead of an HTML page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Accept': 'application/json',
    'X-Request': 'JSON',
    'X-Requested-With': 'XMLHttpRequest',
}

# Browser-like headers without the JSON markers (intended for image requests,
# judging by the name).
headers_img = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}

def save_img(pin_id_list):
    """Download every pin in *pin_id_list* into a user-chosen directory.

    Prompts on stdin for a directory name, creates it if necessary, and runs
    ``find_img`` for each pin id with that directory as the working
    directory. The previous working directory is restored afterwards, even
    when a download raises.

    Args:
        pin_id_list: Iterable of pin ids to fetch.
    """
    dir_name = input("enter the dir_name :")
    try:
        os.mkdir(dir_name)
    except OSError as err:
        # Most commonly "directory already exists": report it and carry on.
        print(err)

    start_dir = os.getcwd()
    # Use the name as given rather than the Windows-only '.\\name' form, so
    # this also works on POSIX systems.
    os.chdir(dir_name)
    try:
        for pin_id in pin_id_list:
            find_img(pin_id)
            print("%s is done." % pin_id)
    finally:
        # Go back to where we started instead of assuming '..' is correct.
        os.chdir(start_dir)

def controlor(boards_id):
    """Crawl one board: collect its pin ids, then download their images.

    Args:
        boards_id: Id of the board to crawl.
    """
    id_list = save_txt(boards_id)
    save_img(id_list)

def save_txt(boards_id):
    """Collect a board's pin ids, dump them to a text file and return them.

    The file is named ``all_<boards_id>.txt`` when the number of collected
    ids equals the pin count reported by ``find_all_pin_id``; otherwise it is
    named ``lack_<missing>_<boards_id>.txt``. Ids are written back to back,
    each followed by ``;``.

    Args:
        boards_id: Id of the board to crawl.

    Returns:
        The list of pin ids returned by ``find_all_pin_id``.
    """
    result = find_all_pin_id(boards_id)
    # find_all_pin_id is indexed rather than unpacked because only the first
    # two items of its result are relied upon here.
    id_list = result[0]
    pin_count = result[1]

    if len(id_list) == pin_count:
        file_name = 'all_' + str(boards_id) + ".txt"
    else:
        lack_number = str(pin_count - len(id_list))
        file_name = 'lack_' + lack_number + "_" + str(boards_id) + ".txt"

    # Build the payload once and write it in a single call.
    payload = "".join(str(pin_id) + ';' for pin_id in id_list)
    with open(file_name, "wb") as f:
        f.write(payload.encode(encoding='utf-8'))

    print("写入boardsID完成")
    return id_list

def find_img(pin_id):
    """Fetch one pin's metadata and hand its image to ``downloader``.

    Requests the pin's JSON, extracts the image key and MIME type (preferring
    ``original_pin.file`` and falling back to any ``file`` entry), and calls
    ``downloader(key, id, extension)``.

    Args:
        pin_id: Id of the pin to fetch.

    Raises:
        ValueError: If the response contains no image key or type.
    """
    url = 'http://huaban.com/pins/%s/?jaw2dlf8' % pin_id
    json_obj = requests.get(url, headers=headers).json()

    # jsonpath.jsonpath() yields False (not an empty list) when nothing matches.
    img_key = jsonpath.jsonpath(json_obj, "$..original_pin.file.key")
    img_type = jsonpath.jsonpath(json_obj, '$..original_pin.file.type')
    if not img_key:
        # original_pin can be empty; some pins keep the key directly under "file".
        img_key = jsonpath.jsonpath(json_obj, "$..file.key")
        img_type = jsonpath.jsonpath(json_obj, '$..file.type')
    if not img_key or not img_type:
        raise ValueError("no image key/type found for pin %s" % pin_id)

    # Prefer the id reported by the server; fall back to the requested one.
    found_ids = jsonpath.jsonpath(json_obj, "$..pin_id")
    img_id = str(found_ids[0] if found_ids else pin_id)

    # "image/jpeg" -> "jpeg"; a value without a slash is used unchanged.
    extension = img_type[0].split("/", 1)[-1]

    downloader(img_key[0], img_id, extension)

def downloader(key, pin_id, type):
    """Download one image and save it as ``<pin_id>.<type>`` in the cwd.

    Builds the image URL from *key*, fetches it and writes the bytes to disk.
    Network, HTTP-status and file errors are not raised; they are appended to
    ``Error_logging.txt`` so one bad image does not stop the crawl.

    Args:
        key: Image key used to build the download URL.
        pin_id: Pin id, used as the file name stem.
        type: File extension without the dot (name kept for caller
            compatibility even though it shadows the builtin).
    """
    url = 'http://img.hb.aicdn.com/' + key + '_fw658'  # build the image URL
    img_name = str(pin_id) + '.' + type
    try:
        # Use the plain browser headers: the JSON/XHR headers are meant for
        # the API endpoints, not for fetching image bytes.
        rsp = requests.get(url, headers=headers_img)
        # Fail on 4xx/5xx instead of saving an error page as an "image".
        rsp.raise_for_status()
        print("---------------------------------------------------------------------------------")
        print("正在下载图片:" + img_name)
        print("下载链接:" + url)
        with open(img_name, "wb") as f:  # write the image to disk
            f.write(rsp.content)
    except (requests.RequestException, OSError) as err:
        error = str(err) + "出错了!图片链接是:" + url + "\n"
        # Append, one entry per line, so earlier failures are not overwritten.
        with open("Error_logging.txt", "ab") as w:
            w.write(error.encode(encoding="utf-8"))

def find_pin_id_20(pin_id, boards_id):
    """Fetch the next page (limit 20) of pin ids on a board.

    Args:
        pin_id: Value sent as the ``max`` query parameter.
        boards_id: Id of the board being paged through.

    Returns:
        A list of the pin ids found in the response, converted to ``int``;
        or the falsy value returned by ``jsonpath.jsonpath`` (``False``) when
        the response contains no ``pin_id``.
    """
    request_URL = 'http://huaban.com/boards/%s/?jbrvz3x1&max=%s&limit=20&wfl=1' % (str(boards_id), str(pin_id))
    print(request_URL)
    json_obj = requests.get(request_URL, headers=headers).json()

    # Recursive-descent match on every "pin_id" in the response.
    pin_id_list = jsonpath.jsonpath(json_obj, "$..pin_id")
    if not pin_id_list:
        # Pass the "no match" value through unchanged for callers to test.
        return pin_id_list
    return [int(each) for each in pin_id_list]

def find_all_pin_id(boards_id):

url = &#39;http://huaban.com/boards/%s/&#39;%str(boards_id)

rsp = requests.get(url,headers=headers)

#print(rsp.text)

json_obj = rsp.json()

pin_count = jsonpath.jsonpath(json_obj, &#39;$..pin_count&#39;)[0]

pin_id_original = jsonpath.jsonpath(json_obj,&#39;$..pin_id&#39;)

#print(len(pin_id_original))

while True:

if len(pin_id_original)

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线