ajax抓取网页内容( 我们想要的图片信息利用正则表达式取到链接进一步抓取到图片 )
优采云 发布时间: 2021-12-03 07:00
ajax抓取网页内容(我们想要的图片信息,利用正则表达式取到链接,进一步抓取到图片)
并且返回的数据中包含了我们需要的信息
接下来我们需要获取这个详细页面的url:
def parse_page_index(html):
try:
data = json.loads(html)  # JSON字符串无法直接读取,所以先用json.loads转换成dict
if data and 'data' in data.keys():
for item in data.get('data'):
yield item.get('article_url')
现在我们可以开始抓取图片了
抓一张图
接下来开始抓取图片,打开图集详情页,查看元素,我们点击Doc,下图中的下划线就是我们想要的图片信息
使用正则表达式获取链接
images_pattern = re.compile('gallery: JSON.parse\((.*?)\),',re.S)
result = re.search(images_pattern,html)
进一步抓取sub_images中的url,这个url就是我们需要的图片
if result:
data = json.loads(result.group(1))
data = json.loads(data)
if data and 'sub_images' in data.keys():
sub_images = data.get('sub_images')
images = [item.get('url') for item in sub_images]
顺便说一下,我们使用 BeautifulSoup 的 CSS 选择器来输出图集的标题
soup = BeautifulSoup(html,'lxml')
title = soup.select('title')[0].get_text()
这样我们就成功抓取到图片了,下一步就是下载图片了
下载并保存
我们把抓取到的图片URL存储到本地数据库中,这里我使用的是MongoDB
首先我们创建数据库,
并建立连接
然后我们开始下载图片并以'jpg'格式保存到本地
def save_images(content):
    """Write image bytes to the current working directory as a ``.jpg`` file.

    The file name is the MD5 hex digest of ``content``, so identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    # os.path.join keeps the path valid on platforms whose separator is not '/'.
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
为了加快速度,我们可以引入Pool来使用多进程下载(multiprocessing.Pool创建的是进程池,而不是线程池)
from multiprocessing import Pool
pool = Pool()
pool.map(main,groups)
最后我们可以看到效果是这样的
完整的代码附在下面:
#coding:utf-8
import os
from hashlib import md5
import pymongo
import re
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import requests
import json
from config import *
from multiprocessing import Pool
from json.decoder import JSONDecodeError
# Module-level MongoDB client; MONGO_URL comes from the star-import of config.
# NOTE(review): connect=False presumably defers the actual connection until
# the first operation, so each Pool worker connects on its own - confirm
# against the PyMongo documentation for the installed version.
client = pymongo.MongoClient(MONGO_URL,connect=False)
# Database handle, selected by the MONGO_DB name from config.
db = client[MONGO_DB]
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block this worker
        # forever; a timeout surfaces as a RequestException subclass.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('请求检索页出错', url)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the article URLs found in a search-index JSON response.

    Args:
        html: JSON text of the index page, or ``None``/empty when the fetch
            failed.

    Yields:
        Each non-empty ``article_url`` value from the entries under the
        top-level ``data`` key. Nothing is yielded when the input is missing,
        is not valid JSON, or does not have the expected structure.
    """
    # A failed fetch hands us None; json.loads(None) would raise TypeError,
    # which is not a JSONDecodeError, so guard before decoding.
    if not html:
        return
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        return
    if not isinstance(payload, dict):
        return
    # "data" may be present but null, hence the `or []`.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL cannot be fetched, so skip them.
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Fetch the HTML of a gallery detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or a failed request).
    """
    # A browser-like user-agent is sent with the request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    try:
        # Bound the wait so one unresponsive page cannot hang the crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求详情页出错', url)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract the title and image URLs from a gallery page and download them.

    The gallery data is embedded in the page as
    ``gallery: JSON.parse("<json string>"),``. The captured group is a JSON
    string literal whose content is itself JSON, so it is decoded twice.

    Args:
        html: Detail page HTML.
        url: Address the page was fetched from; echoed in the result.

    Returns:
        A dict with ``title``, ``url`` and ``images`` keys, or ``None`` when
        the page has no parsable gallery with a ``sub_images`` entry.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Pages without a <title> used to raise IndexError here.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    # Raw string: '\(' is an invalid escape sequence in a normal literal.
    images_pattern = re.compile(r'gallery: JSON.parse\((.*?)\),', re.S)
    match = images_pattern.search(html)
    if not match:
        return None
    try:
        data = json.loads(json.loads(match.group(1)))
    except (JSONDecodeError, TypeError):
        # TypeError: the first decode did not produce a string to decode again.
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    # Drop entries without a URL; they cannot be downloaded.
    images = [item.get('url') for item in sub_images if item.get('url')]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def save_to_mongo(result):
    """Insert one parsed gallery record into the configured collection.

    Args:
        result: Dict produced by ``parse_page_detail``.

    Returns:
        ``True`` when the insert reported an id for the new document,
        ``False`` otherwise.
    """
    # Collection.insert() is deprecated in PyMongo 3 and removed in PyMongo 4;
    # insert_one() is the supported single-document API.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('存储到MongoDB成功', result)
        return True
    return False
def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_images``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; failures are reported on stdout.
    """
    print('正在下载图片', url)
    try:
        # Bound the wait so a single slow image cannot stall the worker.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('请求图片失败', url)
        return None
    if response.status_code == 200:
        save_images(response.content)
    return None
def save_images(content):
    """Write image bytes to the current working directory as a ``.jpg`` file.

    The file name is the MD5 hex digest of ``content``, so identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    # os.path.join keeps the path valid on platforms whose separator is not '/'.
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
def main(offset, keyword='世界杯美女'):
    """Crawl one search-result page: fetch, parse, download and store.

    Args:
        offset: Search result offset for the index page to process.
        keyword: Search keyword; defaults to the keyword this script was
            originally hard-coded with.
    """
    index_html = get_page_index(offset, keyword)
    # The index fetch returns None on failure; there is nothing to parse then.
    if not index_html:
        return
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)
if __name__ == '__main__':
    # One offset per index page, in steps of 20 to match the 'count': 20
    # requested per page in get_page_index.
    groups = [x * 20 for x in range(group_strat, group_end + 1)]
    # The context manager releases the worker processes once map() returns.
    with Pool() as pool:
        pool.map(main, groups)
# Crawler settings. NOTE(review): the script does `from config import *`, so
# these presumably belong in a separate config.py - confirm.

# Host passed to pymongo.MongoClient.
MONGO_URL = 'localhost'
# Name of the database selected on the client.
MONGO_DB = 'toutiao'
# Name of the collection that parsed gallery records are inserted into.
MONGO_TABLE = 'toutiao'
# First and last page group (inclusive); each is multiplied by 20 to form the
# search offset. NOTE(review): "strat" looks like a typo for "start", but the
# entry point references this exact name.
group_strat = 1
group_end = 20