动态网页抓取(一个偶然的机会发现了花瓣网:美女图集_美女相册)

优采云 发布时间: 2021-09-24 02:12

  动态网页抓取(一个偶然的机会发现了花瓣网:美女图集_美女相册)

  偶然发现了花瓣网(美女图集_美女相册_花瓣画板)这个神奇的网站,上面的图片质量都很高。这不就是在诱使我写一个爬虫把它们抓下来吗。

  检查了一下网页,发现这是一个动态加载的网站,有点麻烦。不过没关系,只要分析出请求规则,仍然可以抓取。

  分析网页的规则,每张图片的页面url都有一个pin_id,拿到pin_id就可以进入图片的详情页

  经过一番F12抓包,找到了请求url:

  http://huaban.com/boards/16115516/?jawjlvud&max=1339300078&limit=20&wfl=1

  这是在某个画板下继续加载时请求的 url。可以发现 max= 参数的值是某张图片的 pin_id;该参数可以为空,此时返回前20个 pin_id。

  

  从该请求的响应中取出json数据,可以看到:

  

  pin_id 就在 json 数据中,而继续加载时 url 中的 max= 参数正是上一批的最后一个 pin_id。那么就有一个思路:不断发起请求,每次把返回结果中的最后一个 pin_id 作为下一次请求的 max 参数,就可以得到所有的 pin_id。

  拿到了pin_id,请求图片详情页时发现返回的html里只有文本,无法直接获取图片。

  发现图片的下载链接格式如下:

  url = 'http://img.hb.aicdn.com/'+key+'_fw658'

  这个key包含在返回的html中的一个json格式字符串里,提取出来就可以了

  

  以下是完整代码。更新过一次:现在通过请求头让接口直接返回json格式数据,不需要正则匹配,代码量小了很多。

import os

import requests

import jsonpath

# Request headers for the huaban.com JSON endpoints.  The Accept / X-Request /
# X-Requested-With fields are what make the site answer with JSON (the callers
# below decode every response with ``.json()``).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Accept': 'application/json',
    'X-Request': 'JSON',
    'X-Requested-With': 'XMLHttpRequest',
}

# Plain browser-like headers (same User-Agent, no JSON negotiation fields).
headers_img = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}

def save_img(pin_id_list):
    """Download the image of every pin in *pin_id_list* into one directory.

    The target directory name is read from standard input and the directory
    is created if it does not exist yet.  An empty answer means the current
    directory.  The process working directory is switched to the target while
    downloading, because ``downloader`` writes its files relative to the
    current directory, and is always switched back afterwards.

    Args:
        pin_id_list: iterable of pin ids; each one is passed to ``find_img``.
    """
    dir_name = input("enter the dir_name :") or "."

    # exist_ok: re-running into an already existing directory is not an error.
    os.makedirs(dir_name, exist_ok=True)

    start_dir = os.getcwd()
    # Use the name as given instead of a hard-coded '.\\' prefix so the
    # function also works outside Windows.
    os.chdir(dir_name)
    try:
        for each in pin_id_list:
            find_img(each)
            print("%s is done." % each)
    finally:
        # Restore the exact starting directory even if a download raised;
        # chdir('..') would be wrong for a nested directory name.
        os.chdir(start_dir)

def controlor(boards_id):
    """Run the whole pipeline for one board.

    The pin ids of board *boards_id* are collected (and written to a text
    file) by ``save_txt``; the resulting list is then handed to ``save_img``.
    """
    save_img(save_txt(boards_id))

def save_txt(boards_id):
    """Collect the pin ids of a board, dump them to a text file, return them.

    ``find_all_pin_id`` is expected to return a sequence whose first item is
    the list of collected ids and whose second item is the pin count reported
    for the board.  The file is named ``all_<board>.txt`` when both numbers
    agree and ``lack_<missing>_<board>.txt`` otherwise.  The ids are written
    as UTF-8 text, each followed by ``;``.

    Args:
        boards_id: id of the board to scan.

    Returns:
        The list of pin ids obtained from ``find_all_pin_id``.
    """
    found = find_all_pin_id(boards_id)
    id_list = found[0]
    pin_count = found[1]

    missing = pin_count - len(id_list)
    if missing == 0:
        file_name = 'all_' + str(boards_id) + ".txt"
    else:
        file_name = 'lack_' + str(missing) + "_" + str(boards_id) + ".txt"

    # Build the payload once and write it once instead of encoding and
    # writing each id separately.
    payload = "".join(str(each) + ';' for each in id_list)
    with open(file_name, "wb") as f:
        f.write(payload.encode(encoding='utf-8'))

    print("写入boardsID完成")
    return id_list

def find_img(pin_id):
    """Look up the image key and type of one pin and download the image.

    The pin detail endpoint is requested with the JSON headers and the
    response is searched with jsonpath.  ``jsonpath.jsonpath`` yields
    ``False`` when nothing matches, so every lookup is checked before it is
    indexed.  A pin for which no key/type can be found is reported and
    skipped instead of aborting the whole run.

    Args:
        pin_id: id of the pin whose image should be fetched.
    """
    url = 'http://huaban.com/pins/%s/?jaw2dlf8' % pin_id
    json_obj = requests.get(url, headers=headers).json()

    # Preferred location of the image key and MIME type.
    img_key = jsonpath.jsonpath(json_obj, "$..original_pin.file.key")
    img_type = jsonpath.jsonpath(json_obj, '$..original_pin.file.type')

    if not img_key or not img_type:
        # original_pin can be empty; some pins carry the key directly
        # under "file" instead.
        img_key = jsonpath.jsonpath(json_obj, "$..file.key")
        img_type = jsonpath.jsonpath(json_obj, '$..file.type')

    if not img_key or not img_type:
        print("%s : no image key/type found, skipped." % pin_id)
        return

    # Fall back to the requested id if the response carries no pin_id.
    found_ids = jsonpath.jsonpath(json_obj, "$..pin_id")
    img_id = str(found_ids[0]) if found_ids else str(pin_id)

    # "image/jpeg" -> "jpeg"; a value without a slash is used unchanged.
    extension = img_type[0].split("/", 1)[-1]

    downloader(img_key[0], img_id, extension)

def downloader(key, pin_id, type):
    """Download one image and save it as ``<pin_id>.<type>`` in the cwd.

    The download url is built from *key*.  Network, HTTP-status and file
    errors are not raised; they are appended to ``Error_logging.txt`` in the
    current directory so that one failing image does not stop a batch and
    earlier log entries are kept.

    Args:
        key: image key used to build the download url.
        pin_id: pin id, used as the file name stem.
        type: file extension without the dot (the name shadows the builtin
            but is kept so existing callers keep working).
    """
    url = 'http://img.hb.aicdn.com/' + key + '_fw658'  # build the url
    img_name = str(pin_id) + '.' + type

    print("---------------------------------------------------------------------------------")
    print("正在下载图片:" + img_name)
    print("下载链接:" + url)

    try:
        # Image request: use the plain browser headers, not the JSON API ones.
        rsp = requests.get(url, headers=headers_img)
        # Without this an HTTP error page would be saved as if it were an image.
        rsp.raise_for_status()
        with open(img_name, "wb") as f:  # write the image file
            f.write(rsp.content)
    except (requests.RequestException, OSError) as t:
        error = str(t) + "出错了!图片链接是:" + url + "\n"
        # Append: opening with "wb" would erase every previously logged error.
        with open("Error_logging.txt", "ab") as w:
            w.write(error.encode(encoding="utf-8"))

def find_pin_id_20(pin_id, boards_id):
    """Request the next page of pins of a board and return their ids.

    The board endpoint is queried with ``max=<pin_id>`` and ``limit=20``;
    every ``pin_id`` value found anywhere in the JSON response is collected.

    Args:
        pin_id: value for the ``max`` query parameter (the last pin id of
            the previous page).
        boards_id: id of the board being scanned.

    Returns:
        A list of ints when the response contains pin ids; otherwise the
        falsy jsonpath result (``False``) is returned unchanged so callers
        can detect the end of the board.
    """
    request_URL = 'http://huaban.com/boards/%s/?jbrvz3x1&max=%s&limit=20&wfl=1' % (str(boards_id), str(pin_id))
    print(request_URL)

    json_obj = requests.get(request_URL, headers=headers).json()

    # jsonpath yields False when there is no match.
    pin_id_list = jsonpath.jsonpath(json_obj, "$..pin_id")
    if not pin_id_list:
        return pin_id_list

    return [int(each) for each in pin_id_list]

def find_all_pin_id(boards_id):

url = &#39;http://huaban.com/boards/%s/&#39;%str(boards_id)

rsp = requests.get(url,headers=headers)

#print(rsp.text)

json_obj = rsp.json()

pin_count = jsonpath.jsonpath(json_obj, &#39;$..pin_count&#39;)[0]

pin_id_original = jsonpath.jsonpath(json_obj,&#39;$..pin_id&#39;)

#print(len(pin_id_original))

while True:

if len(pin_id_original)

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线