js 爬虫抓取网页数据(Python爬虫：自动抓取互联网信息的程序与应用架构)

优采云发布时间: 2022-03-12 14:19

　　js 爬虫抓取网页数据(Python爬虫：自动抓取互联网信息的程序与应用架构)

　　爬虫：一段自动抓取互联网信息的程序，从互联网上抓取对我们有价值的信息。

　　Python爬虫架构主要由五部分组成，分别是调度器、URL管理器、网页下载器、网页解析器和应用程序（爬取有价值的数据）。

　　调度器：相当于一台计算机的CPU，主要负责调度URL管理器、下载器、解析器之间的协调。

　　URL管理器：包括要爬取的URL地址和已经爬取的URL地址，防止URL重复爬取和URL循环爬取。实现 URL 管理器的方式主要有 3 种：内存、数据库和缓存数据库。

　　网页下载器：通过传入 URL 地址下载网页，并将网页转换为字符串。网页下载器有urllib2（Python官方基础模块，在Python 3中为urllib.request，可以处理需要登录、代理和cookie的请求）和requests（第三方包）。

　　网页解析器：解析网页字符串，按照我们的需求提取有用的信息，也可以按照DOM树的方式进行解析。网页解析器有：正则表达式（直观地说，就是把网页当作字符串，通过模糊匹配提取有价值的信息；当文档比较复杂时，用这种方法提取数据会很困难）、html.parser（Python自带）、beautifulsoup（第三方插件，既可以使用Python自带的html.parser解析，也可以使用lxml解析，功能相对更强大）、lxml（第三方插件，可以解析xml和HTML）。html.parser、beautifulsoup和lxml都是以DOM树的方式进行解析的。

　　应用程序：它是由从网页中提取的有用数据组成的应用程序。

　　先安装beautifulsoup

　　Beautiful Soup：Python的第三方插件，用于从xml和HTML中提取数据，官网地址：https://www.crummy.com/software/BeautifulSoup/

　　pip install beautifulsoup4（在 cmd 命令提示符下执行此命令）

　　1.爬虫第一个入门程序

　　from bs4 import BeautifulSoup

import urllib.request

# Page to download.
url = "http://www.baidu.com"

# Open the URL; the context manager closes the connection even if reading fails.
with urllib.request.urlopen(url) as response:
    # Read the whole response body as raw bytes.
    ret = response.read()
    # Print the HTTP status code of the response.
    print(response.getcode())

print(ret)

# Parse the downloaded bytes with Python's built-in HTML parser.
soup = BeautifulSoup(ret, "html.parser", from_encoding="utf-8")

# Alternative: list every <a> link instead of the paragraphs.
# links = soup.find_all('a')
# for link in links:
#     print(link.name, link['href'], link.get_text())

# Print the text content of every <p> element.
p = soup.find_all('p')
for ps in p:
    print(ps.get_text())

　　2.1 爬虫程序添加数据

　　import urllib.parse

from urllib import request

# Form fields to submit (fill in real values before running).
values = {"username": "", "password": ""}

# URL-encode the form and convert it to bytes, which urllib requires for a request body.
data = urllib.parse.urlencode(values).encode(encoding="UTF8")

# Target URL.
# url = "http://passport.csdn.net/login?code=applets"
url = "http://mail.qq.com/"

# Build the request; supplying `data` makes urllib send an HTTP POST.
req = request.Request(url, data=data)

# Send the request and print the raw response body; the connection is
# closed when the `with` block exits.
with request.urlopen(req) as resp:
    print(resp.read())

　　2.2爬虫程序添加header

　　import urllib

from urllib import request

# `import urllib` alone does not guarantee that the `parse` submodule is
# loaded, so import it explicitly.
import urllib.parse

url = "http://www.zhihu.com/signin?next=%2F"

# Value sent in the User-Agent request header.
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"

# Form fields to submit (fill in real values before running).
valuse = {'username': '', 'password': ''}
data = urllib.parse.urlencode(valuse).encode(encoding='UTF8')

# Request headers. The standard header name is "Referer"; the misspelled
# "Refere" would be sent as an unrelated, meaningless header.
headers = {'User-Agent': user_agent, 'Referer': 'http://www.zhihu.com/signin?next=%2F'}

# Build the request with both a body and custom headers.
req = request.Request(url, data=data, headers=headers)

# Send the request and print the raw response body; the connection is
# closed when the `with` block exits.
with request.urlopen(req) as resp:
    print(resp.read())

　　2.3 爬虫程序添加post请求

　　import urllib

from urllib import request

# `import urllib` alone does not guarantee that the `parse` submodule is
# loaded, so import it explicitly.
import urllib.parse

url = "http://www.zhihu.com/signin?next=%2F"

# Value sent in the User-Agent request header.
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"

# Form fields to submit (fill in real values before running).
valuse = {'username': '', 'password': ''}

# urllib requires the request body as bytes.
data = urllib.parse.urlencode(valuse).encode(encoding='UTF8')

# Request headers. The standard header name is "Referer"; the misspelled
# "Refere" would be sent as an unrelated, meaningless header.
headers = {'User-Agent': user_agent, 'Referer': 'http://www.zhihu.com/signin?next=%2F'}

# Because `data` is supplied, urllib sends this request as an HTTP POST.
req = request.Request(url, data=data, headers=headers)

# Send the request and print the raw response body; the connection is
# closed when the `with` block exits.
with request.urlopen(req) as resp:
    print(resp.read())

　　3.爬虫添加cookie

　　from http import cookiejar

from urllib import request

# File the cookies are written to; the relative path resolves against the
# current working directory.
filename = 'cookie.txt'

# Cookie jar that can save its contents to a file in the Mozilla cookies.txt format.
cookie = cookiejar.MozillaCookieJar(filename)

# Handler that stores cookies from responses in the jar and sends them on later requests.
handler = request.HTTPCookieProcessor(cookie)

# Opener that routes every request through the cookie handler.
opener = request.build_opener(handler)

# Make a request so the server's cookies land in the jar. The body is not
# needed; the `with` block just makes sure the connection is closed.
with opener.open("http://www.baidu.com") as response:
    # ignore_discard=True: also save cookies that are marked to be discarded
    #                      (e.g. session cookies).
    # ignore_expires=True: also save cookies that have already expired.
    cookie.save(ignore_discard=True, ignore_expires=True)

　　#利用cookie登录网站

from urllib import request

import urllib

from http import cookiejar

import os
# `import urllib` alone does not guarantee that the `parse` submodule is loaded.
import urllib.parse

# File the session cookies are written to.
filename = 'cookie02.txt'

# Cookie jar that can save its contents to a file in the Mozilla cookies.txt format.
cookie = cookiejar.MozillaCookieJar(filename)

# Handler that stores cookies from responses and replays them on later requests.
handler = request.HTTPCookieProcessor(cookie)

# Opener that routes every request through the cookie handler.
opener = request.build_opener(handler)

# Login form. Credentials are read from the environment rather than being
# hard-coded: real account details must never be committed or published.
# Set JWC_USERNAME / JWC_PASSWORD before running.
postdata = urllib.parse.urlencode({
    'username': os.environ.get('JWC_USERNAME', ''),
    'password': os.environ.get('JWC_PASSWORD', ''),
}).encode(encoding='UTF8')

# Login endpoint.
loginUrl = "http://jwc.hnshzy.cn:90/hnshjw/cas/login.action"

# Simulate the login by submitting the form, then save the cookies it set.
with opener.open(loginUrl, postdata) as result:
    # ignore_discard=True: also save cookies marked to be discarded.
    # ignore_expires=True: also save cookies that have already expired.
    cookie.save(ignore_discard=True, ignore_expires=True)

# Page to request next, reusing the cookies obtained by the login.
ner_url = 'http://jwc.hnshzy.cn:90/hnshjw/cas/login.action'

try:
    result = opener.open(ner_url)
except request.HTTPError as e:
    # The server answered with an HTTP error status.
    # HTTPError is a subclass of URLError, so it must be caught first.
    print(e.code)
except request.URLError as e:
    # The request failed before a valid HTTP response arrived.
    print(e.reason)
else:
    # Success: print the raw body and close the connection.
    with result:
        print(result.read())

　　4.正则表达式

　　from urllib import request

from bs4 import BeautifulSoup

import re

# Compile a pattern. The r prefix makes it a raw string, so backslashes reach
# the regex engine unchanged instead of being treated as string escapes.
pattern = re.compile(r'hello')

# re.match only matches at the beginning of the string.
result1 = re.match(pattern, "hello,world")
if result1:
    print(result1)

# span() returns the (start, end) indices of the match.
result1 = re.match(pattern, "hello,world").span()
if result1:
    print(result1)

# group() returns the matched text.
result1 = re.match(pattern, "hello,world").group()
if result1:
    print(result1)

# "hell,hello" does not START with "hello", so match() returns None.
result2 = re.match(pattern, "hell,hello")
if result2:
    print(result2)
else:
    print("no!no!")

# re.search scans the whole string, so it finds the later "hello".
result2 = re.search(pattern, "hell,hello")
if result2:
    print(result2)

# "." matches any single character (except a newline).
pattern = re.compile(r'a.c')
resp = re.match(pattern, "abcdefj")
print(resp)

# r"\." matches a literal dot. A raw string is needed because "\." in a
# normal string literal is an invalid escape sequence.
pattern2 = re.compile(r'a\.c')
resp2 = re.match(pattern2, "a.cd")
print(resp2)

# r"\\" matches one literal backslash; the subject string spells its single
# backslash as "\\" instead of relying on the invalid escape "\c".
pattern3 = re.compile(r'a\\c')
resp3 = re.match(pattern3, "a\\cd")
print(resp3)

# Character class: one letter in a-f or A-F. Ranges are listed back to back;
# a comma inside [] would itself be matched as a literal character.
pattern4 = re.compile(r'a[a-fA-F]c')
resp4 = re.match(pattern4, "afcdefg")
print(resp4)

# "*": zero or more digits.
re01 = re.compile(r'\d*')
res01 = re.match(re01, '123456xxxxxxxxx')
print(res01)

# "+": one or more digits, followed by one word character.
re02 = re.compile(r'\d+\w')
res02 = re.match(re02, '123456xxxxxxxxx')
print(res02)

# "?": an optional digit, followed by one word character.
re03 = re.compile(r'\d?\w')
res03 = re.match(re03, '4xxxxxxxxx')
print(res03)

# "{5}": exactly five digits after a leading "1".
re04 = re.compile(r'1\d{5}')
res04 = re.match(re04, '123456xxxxxxxxx')
print(res04)

# "{5,11}": between 5 and 11 digits, then "@", two word characters, a literal
# dot and three word characters.
re05 = re.compile(r'\d{5,11}@\w{2}\.\w{3}')
res05 = re.match(re05, '123456789@qq.com')
print(res05)

#贪婪模式

re06 = re.compile(r'\w+')