网页爬虫抓取百度图片(一个口罩识别应用抓取360图片的过程及流程图分析)
优采云 发布时间: 2022-01-07 19:18 网页爬虫抓取百度图片(一个口罩识别应用抓取360图片的过程及流程图分析)
我目前在做一个口罩识别应用,需要很多戴口罩的人的图片作为数据训练模型。由于公司不提供数据,我们只能使用python爬虫爬取所有主流的网站戴口罩图片。我们主要爬取了必应和360、搜狗的图片(百度的有点复杂,不如这三个),代码如下(仅供学习参考):
必应搜索
from bs4 import BeautifulSoup
import urllib.request
import requests
import time
import json
import sys
import re
import os
# Target URL template for the crawl (Bing's async image-search endpoint).
CRAWL_TARGET_URL = 'https://cn.bing.com/images/async?q=%s&first=%d&count=%d&relp=%d&lostate=r&mmasync=1'
# Number of images requested per page (35 is this endpoint's per-scroll batch size).
NUMS_PER_CRAWL = 35
# Minimum accepted image size (bytes); smaller downloads are discarded.
MIN_IMAGE_SIZE = 10
def get_image(url, path, count, min_size=None):
    """Download one image from *url* and save it as <count>.<ext> under *path*.

    Parameters:
        url: direct image URL; its extension (".jpg", ".png", ...) is reused
            for the saved file, falling back to ".jpg" when unrecognizable.
        path: destination directory, created if it does not exist.
        count: numeric file-name stem for the saved image.
        min_size: minimum payload size in bytes; smaller responses are
            rejected. Defaults to the module-level MIN_IMAGE_SIZE.

    Returns:
        0 on success, -1 when the payload is too small, -2 on a download
        error, -3 on a write error.
    """
    if min_size is None:
        min_size = MIN_IMAGE_SIZE
    try:
        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(url, timeout=5) as u:
            t = u.read()
        # len(t) is the real payload size; sys.getsizeof adds Python object
        # overhead (~33 bytes) and therefore never rejected tiny responses.
        if len(t) < min_size:
            return -1
    except Exception as e:
        print(url, e)
        return -2
    # Extract the file extension from the URL; fall back to ".jpg" when no
    # extension matches (the original crashed with AttributeError on None).
    m = re.match(r"^\.[a-zA-Z]+", url[url.rfind('.'):])
    frmt = m.group(0) if m else '.jpg'
    try:
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, str(count) + frmt), 'wb') as f:
            f.write(t)
    except Exception as e:
        print(os.path.join(path, str(count) + frmt), e)
        return -3
    return 0
def crawl_data(info, num):
    """Crawl up to *num* Bing image-search results for keyword *info*.

    Images are saved under ./<info>/, numbered on from however many files
    are already there, so reruns extend rather than overwrite a dataset.

    Returns:
        True once the crawl finishes (target count reached or result
        pages exhausted).
    """
    first = 0    # paging offset passed to the async endpoint
    count = 0    # images successfully downloaded in this run
    # One session reuses the TCP connection across page requests.
    s = requests.Session()
    path = "./" + info
    os.makedirs(path, exist_ok=True)
    index = len(os.listdir(path))  # continue numbering after existing files
    while count < num:
        u = CRAWL_TARGET_URL % (info, first, NUMS_PER_CRAWL, NUMS_PER_CRAWL)
        # 3.05 s connect timeout, 10 s read timeout.
        req = s.get(url=u, timeout=(3.05, 10))
        bf = BeautifulSoup(req.text, "html.parser")
        imgtags = bf.find_all("a", class_="iusc")
        if not imgtags:
            # No further results for this keyword; the original looped
            # forever here because count never advanced.
            break
        for e in imgtags:
            if count == num:
                # Fixed: reaching the target is success; the original
                # returned False here but True on normal loop exit.
                return True
            try:
                urldict = json.loads(e.get('m'))
            except (TypeError, ValueError):
                continue  # missing or malformed metadata attribute; skip
            if get_image(urldict["murl"], path, index) < 0:
                continue
            print("Downloaded %d picture" % (count + 1))
            sys.stdout.flush()
            count += 1
            index += 1
            time.sleep(0.01)  # be polite between image downloads
        first += NUMS_PER_CRAWL
        time.sleep(0.1)  # brief pause between result pages
    return True
if __name__ == '__main__':
    # Search keywords; add more entries to crawl several terms in sequence.
    key_words = ['戴口罩', ]
    # Number of images to download per keyword.
    picture_num = 10000
    # Iterate the keywords directly instead of range(len(...)); the original
    # also incremented the loop index by hand, which was a no-op because the
    # for statement reassigns it on every iteration.
    for word in key_words:
        print(word)
        crawl_data(word, picture_num)
360搜索
import json
import os
import requests
# Root directory where downloaded images are stored.
BASE_URL = './戴口罩'
# Search keyword.
NAME = '戴口罩'
class PictureDownload(object):
    """Download thumbnail images from the 360 (so.com) mobile image-search API.

    Results are fetched 100 per page and written below BASE_URL/<keyword>/.
    """

    def __init__(self, q=None, sn=100):
        # API endpoint template: q = keyword, sn = result offset, pn = page size.
        self.url = 'https://m.image.so.com/j?q={}&src=srp&pn=100&sn={}&kn=0&gn=0&cn=0'
        # Mobile user agent so the mobile endpoint answers with JSON.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
        }
        self.q = q        # search keyword
        self.sn = sn      # requested batch size (kept for interface compatibility; run() pages by 100)
        self.num = 0      # current result offset
        self.total = 2    # total result count; overwritten by the API response

    def makedir(self):
        """Create the BASE_URL/<keyword> output directory if it is missing."""
        # exist_ok avoids the check-then-create race of the original.
        os.makedirs(os.path.join(BASE_URL, self.q), exist_ok=True)

    def parse_url(self):
        """Fetch one page of search results and return the raw JSON text."""
        response = requests.get(self.url.format(self.q, self.num), headers=self.headers)
        return response.content.decode()

    def parse_image_list(self, html_json_str):
        """Return (image_list, total) extracted from the API's JSON payload."""
        payload = json.loads(html_json_str)  # parse once, not twice as before
        return payload['list'], payload['total']

    def save_image(self, image_list):
        """Download each item's thumbnail to BASE_URL/<keyword>/<index>.jpg."""
        for item in image_list:
            response = requests.get(item['thumb'], headers=self.headers)
            # os.path.join builds a portable path; the original hard-coded a
            # backslash ('%s\%s.jpg'), which only works on Windows.
            target = os.path.join(BASE_URL, self.q, '%s.jpg' % item['index'])
            with open(target, 'wb') as f:
                f.write(response.content)

    def run(self):
        """Page through the search results until the API's total is reached."""
        self.makedir()
        while self.num < self.total:
            html_json_str = self.parse_url()
            image_list, self.total = self.parse_image_list(html_json_str)
            self.save_image(image_list)
            self.num += 100  # the endpoint serves 100 results per page (pn=100)
            print(self.num)
if __name__ == '__main__':
    # Crawl thumbnails for the configured keyword.
    downloader = PictureDownload(NAME)
    downloader.run()
搜狗搜索(代码从略,思路与上述必应、360两例类似)
抓取360图片的过程如图1所示:
图1 360图片抓取流程图
我们可以看到,用pycharm运行程序后,图片开始一张一张的下载。当然,有些图片会干扰数据,需要手动清理。比起一一下载,方便多了。