Web Scraping Image Search Results (Crawling Masked-Face Images from Bing, 360, and Sogou for Mask Detection)


  I am currently working on a mask-detection application and need a large number of images of people wearing masks to train the model. Since the company does not provide data, the only option is to use a Python crawler to collect masked-face images from the major image-search sites. We mainly crawled Bing, 360, and Sogou (Baidu is somewhat more complicated than these three). The code is below (for learning and reference only):

  Bing Search

  

from bs4 import BeautifulSoup
import urllib.request
import requests
import time
import json
import sys
import re
import os

# URL template for Bing's async image-search endpoint
CRAWL_TARGET_URL = 'https://cn.bing.com/images/async?q=%s&first=%d&count=%d&relp=%d&lostate=r&mmasync=1'
# Images fetched per request (35 is the page size Bing loads per scroll)
NUMS_PER_CRAWL = 35
# Minimum image size in bytes; anything smaller is discarded
MIN_IMAGE_SIZE = 10

def get_image(url, path, count):
    try:
        u = urllib.request.urlopen(url, timeout=5)
        t = u.read()
        if len(t) < MIN_IMAGE_SIZE:
            return -1
    except Exception as e:
        print(url, e)
        return -2
    # Extract the image extension from the URL
    frmt = url[url.rfind('.'):]
    m = re.match(r"^\.[a-zA-Z]+", frmt)
    if m is None:
        return -1  # no usable extension in the URL
    frmt = m.group(0)
    try:
        if not os.path.exists(path):
            os.mkdir(path)
        with open(os.path.join(path, str(count) + frmt), 'wb') as f:
            f.write(t)
    except Exception as e:
        print(os.path.join(path, str(count) + frmt), e)
        return -3
    return 0

def crawl_data(info, num):
    first = 0
    count = 0
    # Reuse one session across all paging requests
    s = requests.Session()
    # Output directory, named after the keyword
    path = "./" + info
    if not os.path.exists(path):
        os.mkdir(path)
    index = len(os.listdir(path))  # images already in the folder, so numbering resumes
    while count < num:
        u = CRAWL_TARGET_URL % (info, first, NUMS_PER_CRAWL, NUMS_PER_CRAWL)
        # 3.05 s connect timeout, 10 s read timeout
        req = s.get(url=u, timeout=(3.05, 10))
        bf = BeautifulSoup(req.text, "html.parser")
        # Each result is an <a class="iusc"> whose 'm' attribute holds a JSON blob
        imgtags = bf.find_all("a", class_="iusc")
        for e in imgtags:
            if count == num:
                return False
            urldict = json.loads(e.get('m'))
            # 'murl' is the full-size image URL
            if get_image(urldict["murl"], path, index) < 0:
                continue
            print("Downloaded %d picture" % (count + 1))
            sys.stdout.flush()
            count = count + 1
            index = index + 1
            time.sleep(0.01)
        first = first + NUMS_PER_CRAWL
        time.sleep(0.1)
    return True

if __name__ == '__main__':
    # Search keywords; add more to the list as needed
    key_words = ['戴口罩']
    # Number of images to download per keyword
    picture_num = 10000
    for word in key_words:
        print(word)
        crawl_data(word, picture_num)
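  When run, the script creates a ./戴口罩 folder next to it and saves images as 0.jpg, 1.png, and so on. Because the starting index is taken from the number of files already in the folder, re-running the script resumes the numbering instead of overwriting earlier downloads.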

  360 Search

import json
import os

import requests

# Root directory for downloads
BASE_URL = './戴口罩'
# Search keyword ("wearing a mask")
NAME = '戴口罩'

class PictureDownload(object):
    def __init__(self, q=None):
        # 360's mobile image-search endpoint returns JSON directly;
        # q is the keyword, sn the result offset, pn the page size
        self.url = 'https://m.image.so.com/j?q={}&src=srp&pn=100&sn={}&kn=0&gn=0&cn=0'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) '
                          'AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
        }
        self.q = q
        self.num = 0    # current offset
        self.total = 2  # placeholder; overwritten by the real total after the first request

    def makedir(self):
        # Create the keyword subdirectory if it does not exist yet
        if not os.path.exists(os.path.join(BASE_URL, self.q)):
            os.makedirs(os.path.join(BASE_URL, self.q))

    def parse_url(self):
        # Fetch one page of results as a JSON string
        response = requests.get(self.url.format(self.q, self.num), headers=self.headers)
        return response.content.decode()

    def parse_image_list(self, html_json_str):
        # 'list' holds the items on this page, 'total' the overall hit count
        data = json.loads(html_json_str)
        return data['list'], data['total']

    def save_image(self, image_list):
        # Download each thumbnail and name the file after its result index
        for item in image_list:
            response = requests.get(item['thumb'], headers=self.headers)
            with open(os.path.join(BASE_URL, self.q, '%s.jpg' % item['index']), 'wb') as f:
                f.write(response.content)

    def run(self):
        self.makedir()
        while self.num < self.total:
            html_json_str = self.parse_url()
            image_list, self.total = self.parse_image_list(html_json_str)
            self.save_image(image_list)
            self.num += 100  # advance the offset by one page
            print(self.num)

if __name__ == '__main__':
    downloader = PictureDownload(NAME)
    downloader.run()
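  A note on the design choice here: the script queries 360's mobile endpoint m.image.so.com, which returns search results directly as JSON, so no HTML parsing is needed at all. If the endpoint still behaves the way the code above assumes, one page of the response can be probed like this (a quick inspection sketch, not part of the downloader):

import requests

# Fetch one page from the same endpoint the class above uses.
params = {'q': '戴口罩', 'src': 'srp', 'pn': 100, 'sn': 0, 'kn': 0, 'gn': 0, 'cn': 0}
headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X)'}
data = requests.get('https://m.image.so.com/j', params=params, headers=headers).json()
print(data['total'])             # overall hit count; drives the while loop above
print(data['list'][0]['thumb'])  # thumbnail URL of the first result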

  Sogou Search
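  Sogou can be handled the same way as 360: its image search also serves results over an AJAX endpoint that returns JSON. Below is a minimal sketch in the same style; the endpoint URL, the reqType=ajax parameter, and the items/pic_url field names are assumptions that should be verified against the live site (for example, in the browser's network tab) before use.

import os
import requests

# Hypothetical Sogou image downloader, mirroring the 360 class above.
# ASSUMPTION: the endpoint and JSON field names ('items', 'pic_url') are unverified.
SOGOU_URL = 'https://pic.sogou.com/pics'

def download_sogou(keyword, pages=5, out_dir='./戴口罩'):
    os.makedirs(out_dir, exist_ok=True)
    headers = {'User-Agent': 'Mozilla/5.0'}
    count = 0
    for page in range(pages):
        # reqType=ajax is assumed to request JSON instead of the HTML results page
        params = {'query': keyword, 'mode': 1, 'start': page * 48,
                  'reqType': 'ajax', 'reqFrom': 'result', 'tn': 0}
        data = requests.get(SOGOU_URL, params=params, headers=headers).json()
        for item in data.get('items', []):  # field name: assumption
            img_url = item.get('pic_url')   # field name: assumption
            if not img_url:
                continue
            try:
                content = requests.get(img_url, headers=headers, timeout=5).content
            except Exception as e:
                print(img_url, e)
                continue
            with open(os.path.join(out_dir, '%d.jpg' % count), 'wb') as f:
                f.write(content)
            count += 1

if __name__ == '__main__':
    download_sogou('戴口罩')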

  The 360 crawl in progress is shown in Figure 1.

  Figure 1: the 360 image crawler running (screenshot of the download in progress)

  As you can see, once the program is run in PyCharm, the images download one after another. Of course, some of them are noise for the dataset and still have to be cleaned out by hand, but this is far more convenient than downloading every image individually.
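  Part of that cleanup can be automated. Here is a minimal sketch, assuming Pillow is installed (pip install Pillow), that deletes files which cannot be decoded as images and removes exact byte-for-byte duplicates by MD5 hash; near-duplicates and off-topic images still need a manual pass:

import hashlib
import os

from PIL import Image  # assumption: Pillow is installed

def clean_folder(path):
    seen = set()
    for name in os.listdir(path):
        fpath = os.path.join(path, name)
        # Delete files that cannot be decoded as images
        try:
            with Image.open(fpath) as img:
                img.verify()
        except Exception:
            os.remove(fpath)
            continue
        # Delete exact duplicates (identical bytes hash to the same MD5)
        with open(fpath, 'rb') as f:
            digest = hashlib.md5(f.read()).hexdigest()
        if digest in seen:
            os.remove(fpath)
        else:
            seen.add(digest)

clean_folder('./戴口罩')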
