输入关键字抓取所有网页(零基础入门Python的先从python基础学起，打好基础 )

优采云发布时间: 2022-03-19 09:13

　　输入关键字抓取所有网页(零基础入门Python的先从python基础学起，打好基础)

　　文章目录

　　温馨提示：以下为本文正文内容文章，以下案例供参考

　　前言：

　　从零基础上手Python，先从Python的基础开始，打好基础，再慢慢接触下面的内容，给自己找任务，多做练习。也建议大家在学习Python的时候多写多读。

　　网页分析

　　节省时间，废话不多说，直接上例子！！！

　　让我们用 requests 和 BeautifulSoup 写一个爬虫。每个人都有自己喜欢的车！（包括我也是）所以本章我们将车辆信息存储在本地，方便浏览汽车的价格和车况，有钱就买。

　　ok，以下是开始学习python的正确姿势，请用电脑操作。

　　首先，我们需要打开网址，进行网页分析，了解网页可以实现哪些方法。

　　进入网站后，我们点击买车，就可以看到该车的所有信息。

　　先判断网页是动态的还是静态的。在网页源码中搜索关键字能够找到对应内容，说明页面是静态的，那么我们就可以用常规的方法直接请求URL了。

　　每页显示 48 辆汽车。当我们点击下一页的时候，可以看到地址变了，

　　page=2#pagetag

　　page=3#pagetag

　　page=4#pagetag

　　可以观察到page=2中的数字代表翻页。后面翻页的时候，我们可以直接使用循环变量翻页来获取不同页面的内容。

　　明确我们需要的信息是

　　车名

　　价钱

　　封面图片

　　里程

　　引擎

　　排量

　　上牌时间

　　通过页面源码我们可以了解到这些信息是放在ul标签的li标签中的。

　　然后我们可以发起一个请求，使用 BeautifulSoup 来提取我们需要的信息。分析完成后，我们就可以开始操作了。

　　大意

　　1 使用requests发起请求

　　2 编写循环变量实现翻页

　　3 然后用 BeautifulSoup 解析返回的 HTML

　　4 解析后，将内容保存到文件中

　　实施步骤

　　导入对应的库后，构造一个fake header，防止简单的反爬，然后向URL发起请求，返回文本。

　　import requests

from bs4 import BeautifulSoup

import xlwt

import os

import re

#

def get_content(url):
    """Download ``url`` and return its decoded body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        when the request raises a ``requests.RequestException`` or any other
        status code comes back.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    try:
        # ``headers`` has to be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, so a positional dict
        # ends up in the query string and the User-Agent is never sent.
        # The timeout keeps a stalled server from hanging the crawl; a
        # timeout is reported through ``requests.RequestException`` as well.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('error', e)
        return None
    if response.status_code != 200:
        return None
    # Decode with the charset detected from the body rather than the one
    # announced (or omitted) in the response headers.
    response.encoding = response.apparent_encoding
    return response.text

　　拿到网页源代码后，我们可以使用BeautifulSoup解析来解析内容，找到所有的汽车标签li，循环每个li使用BeautifulSoup标签和属性找到我们需要的内容，得到详情页，然后进入提取内容的详细信息页面。

　　def get_data(response):

# Tutorial excerpt: parses the listing-page HTML passed in as `response`.
# The loop body below continues in the next excerpt of the article.

# Build the parser object (lxml backend)

soup = BeautifulSoup(response,'lxml')

# Every <li> inside the <ul class="gongge_ul"> element

all_data = soup.find('ul',class_="gongge_ul").find_all('li')

for i in all_data:

# Text of the first <span> inside the item's "gongge_main" div

title = i.find('div',class_="gongge_main").find('span').text

# `data-src` attribute of the <img> inside the "item_img" div

images = i.find('div',class_="item_img").find('img').get('data-src')

# Text of the second <i> inside the "gongge_main" div

gl = i.find('div',class_="gongge_main").find_all('i')[1].text

# Detail page: href of the first <a> in the item

ditail = i.find('a').get('href')

　　这是汽车详情页面

　　如果有详情页，我们会向详情页发起请求，进入详情页后同样使用网页源代码中的标签和属性来提取内容。提取完我们需要的信息后，再调用其他函数并把这些信息作为参数传入。

　　# Detail page: download the page behind `ditail` and parse it.
# NOTE(review): presumably this continues the `for i in all_data` loop body
# of get_data from the previous excerpt — confirm against the full listing.

ditail_data = requests.get(url=ditail).text

soup_li = BeautifulSoup(ditail_data,'lxml')

# One pass per element carrying the "detail-wrapper" class

for xt in soup_li.find_all(class_="detail-wrapper"):

# Text of the "price-this" element

momey = xt.find(class_="price-this").text

# Third <li> of the parameter list, with newlines and spaces removed

ml = xt.find(class_="col-xs-6 parameter-configure-list").find_all('li')[2].text.replace('\n','').replace(' ','')

# Text of the third <dl> inside "summary-attrs"

pl = xt.find(class_="summary-attrs").find_all('dl')[2].text

# Text of the first <dl> inside "summary-attrs" with its '上牌时间' label removed.
# NOTE(review): the local name `time` is also the name of a stdlib module.

time = xt.find('div',class_="summary-attrs").find('dl').text.replace('上牌时间','')

# Whole "row parameter-configure" block as text, newlines and spaces removed

content = xt.find('div',class_="row parameter-configure").text.replace('\n','').replace(' ','')

# Collected fields; this dict is only printed, the values are passed on individually below

item = {

'标题':title,

'图片':images,

'里程':gl,

'价格':momey,

'情况':ml,

'万里':pl,

'时间':time,

'详情':content

}

print(item)

# Hand the extracted values to the two save helpers

save_CSV(title,images,gl,momey,ml,pl,time,content)

save_Images(title,images)

　　信息提取完成后，将数据保存到Excel表中

　　def save_CSV(title,images,gl,momey,ml,pl,time,content):

# Writes the eight values into columns 0-7 of row `n` of `sheet`, advances
# `n`, saves `book` and prints a progress line.
# `n`, `sheet` and `book` are not defined in this excerpt.

global n

sheet.write(n, 0, title)

sheet.write(n, 1, images)

sheet.write(n, 2, gl)

sheet.write(n, 3, momey)

sheet.write(n, 4, ml)

sheet.write(n, 5, pl)

sheet.write(n, 6, time)

sheet.write(n, 7, content)

# Move on to the next row for the following call

n = n + 1

# The workbook is saved again on every call.
# NOTE(review): if `book` is an xlwt workbook, the output is legacy .xls data
# and the .xlsx extension would not match the file format — confirm.

book.save(u'汽车.xlsx')

print('正在保存===>:',title)

　　最后一件事是将封面保存到本地。

　　def save_Images(title,images):

# Downloads the resource at `images` and writes it to a .jpg file named after
# `title`, creating the 'tche' folder first when it does not exist.

if not os.path.exists('tche'):

os.mkdir('tche') # create the folder

# Raw bytes of the downloaded resource

images_data = requests.get(url=images).content

# NOTE(review): non-raw pattern containing `\|`; backslash, `<` and `>` are not in the set.

big = '[?/\|:"*]' # characters removed from the title before it is used as a file name

li = re.sub(big,"",title)

# NOTE(review): the 'tche\\' prefix uses a backslash as the path separator — confirm target OS.

with open('tche\\' + li + '.jpg', mode='wb')as f:

f.write(images_data)

print('正在保存图片=====>:',title)

　　打印结果

　　这里用一个循环变量来实现翻页。（需要抓取更多页时，修改循环中的数字即可）

　　这里我们将汽车信息保存到Excel。

　　完成

　　实现所有代码：

　　import requests

from bs4 import BeautifulSoup

import xlwt

import os

import re

# 发送请求

def get_content(url):
    """Download ``url`` and return its decoded body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        when the request raises a ``requests.RequestException`` or any other
        status code comes back.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    try:
        # ``headers`` has to be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, so a positional dict
        # ends up in the query string and the User-Agent is never sent.
        # The timeout keeps a stalled server from hanging the crawl; a
        # timeout is reported through ``requests.RequestException`` as well.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('error', e)
        return None
    if response.status_code != 200:
        return None
    # Decode with the charset detected from the body rather than the one
    # announced (or omitted) in the response headers.
    response.encoding = response.apparent_encoding
    return response.text

# 打开xlwt

# Create the workbook with a single sheet and fill in the heading row.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('汽车', cell_overwrite_ok=True)
for _column, _heading in enumerate(
        ('名称', '图片', '万里', '价格', '马力', '排量', '上牌时间', '详情信息')):
    sheet.write(0, _column, _heading)

# Index of the next row to write; row 0 holds the headings.
n = 1

def get_data(response):
    """Parse one listing page and save every car found on it.

    For each ``li`` inside ``ul.gongge_ul`` the title, cover-image URL and the
    second ``<i>`` text are read from the listing, the linked detail page is
    downloaded, and each ``detail-wrapper`` section on it is printed, written
    to the workbook through ``save_CSV`` and its cover stored through
    ``save_Images``.

    Args:
        response: HTML text of a listing page, or ``None``/empty when the
            download failed.

    Returns:
        None.
    """
    # get_content() returns None on failure; there is nothing to parse then.
    if not response:
        return

    soup = BeautifulSoup(response, 'lxml')
    listing = soup.find('ul', class_="gongge_ul")
    if listing is None:
        # The page does not contain the car list, so skip it instead of
        # failing on a None lookup.
        return

    for car in listing.find_all('li'):
        main_box = car.find('div', class_="gongge_main")
        title = main_box.find('span').text
        images = car.find('div', class_="item_img").find('img').get('data-src')
        gl = main_box.find_all('i')[1].text
        detail_url = car.find('a').get('href')

        # A failing detail request skips this car rather than aborting the
        # whole crawl; the timeout keeps a stalled server from hanging it.
        try:
            detail_html = requests.get(url=detail_url, timeout=10).text
        except requests.RequestException as e:
            print('error', e)
            continue

        detail_soup = BeautifulSoup(detail_html, 'lxml')
        for xt in detail_soup.find_all(class_="detail-wrapper"):
            momey = xt.find(class_="price-this").text
            ml = xt.find(class_="col-xs-6 parameter-configure-list").find_all('li')[2].text.replace('\n', '').replace(' ', '')
            pl = xt.find(class_="summary-attrs").find_all('dl')[2].text
            # Named reg_time so the local does not shadow the stdlib
            # module name ``time``.
            reg_time = xt.find('div', class_="summary-attrs").find('dl').text.replace('上牌时间', '')
            content = xt.find('div', class_="row parameter-configure").text.replace('\n', '').replace(' ', '')

            item = {
                '标题': title,
                '图片': images,
                '里程': gl,
                '价格': momey,
                '情况': ml,
                '万里': pl,
                '时间': reg_time,
                '信息': content,
            }
            print(item)

            save_CSV(title, images, gl, momey, ml, pl, reg_time, content)
            save_Images(title, images)

def save_CSV(title, images, gl, momey, ml, pl, time, content):
    """Append one car as a worksheet row and save the workbook.

    The eight values are written, in parameter order, to columns 0-7 of row
    ``n`` of the module-level ``sheet``; ``n`` is then advanced and the
    module-level ``book`` is written to disk.

    Args:
        title: Value for column 0; also shown in the progress message.
        images: Value for column 1.
        gl: Value for column 2.
        momey: Value for column 3.
        ml: Value for column 4.
        pl: Value for column 5.
        time: Value for column 6.
        content: Value for column 7.

    Returns:
        None.
    """
    global n
    row_values = (title, images, gl, momey, ml, pl, time, content)
    for column, value in enumerate(row_values):
        sheet.write(n, column, value)
    n += 1
    # xlwt only writes the legacy binary .xls format, so the file must carry
    # the .xls extension; the same bytes named .xlsx are rejected by Excel as
    # a format/extension mismatch.
    # Saving after every row keeps the rows gathered so far on disk if a
    # later request fails.
    book.save('二手车.xls')
    print('正在保存===>:', title)

def save_Images(title, images):
    """Download a cover image into the ``che`` folder.

    The file is named after ``title`` with characters that are not allowed in
    file names removed, plus a ``.jpg`` suffix.

    Args:
        title: Car title used to build the file name.
        images: URL of the image; when it is ``None`` or empty the car is
            skipped.

    Returns:
        None.
    """
    if not images:
        # No image URL was found on the listing; requesting None would raise.
        print('no image url:', title)
        return

    # exist_ok avoids the separate exists() check.
    os.makedirs('che', exist_ok=True)

    try:
        images_data = requests.get(url=images, timeout=10).content
    except requests.RequestException as e:
        # A failed image download should not abort the whole crawl.
        print('error', e)
        return

    # Raw string so the backslash is a real regex escape; the set covers the
    # characters that cannot appear in a file name: \ / : * ? " < > |
    safe_name = re.sub(r'[\\/:*?"<>|]', "", title)

    # os.path.join picks the separator of the running platform; a hard-coded
    # backslash only forms a directory path on Windows.
    with open(os.path.join('che', safe_name + '.jpg'), mode='wb') as f:
        f.write(images_data)
    print('正在保存图片=====>:', title)

def main(page):
    """Fetch the listing page and hand its HTML to ``get_data``.

    Args:
        page: Page number; it is used only in the progress banner. The URL
            below is a placeholder literal that does not include it.

    Returns:
        None.
    """
    url = '。。。。。。。。。'
    print(f'==============================正在保存第{page}页的数据内容==============================')
    get_data(get_content(url))

if __name__ == '__main__':
    # Run main() for pages 1 through 5.
    for page_number in range(1, 6):
        main(page=page_number)

0

2022-03-19

输入关键字抓取所有网页

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

输入关键字抓取所有网页(零基础入门Python的先从python基础学起，打好基础 )

0 个评论

发起人

AI时代内容工厂

输入关键字 抓取所有网页(零基础入门Python的先从python基础学起，打好基础 )

0 个评论

发起人

输入关键字抓取所有网页(零基础入门Python的先从python基础学起，打好基础 )