java爬虫抓取网页数据(Java爬虫采集网页数据(简单介绍)(1)_ )
优采云 发布时间: 2022-01-12 00:08java爬虫抓取网页数据(Java爬虫采集网页数据(简单介绍)(1)_
)
Java爬虫采集网页数据一、爬虫简介
网络爬虫(也称为网络蜘蛛、网络机器人,在 FOAF 社区中,更常被称为网页追逐者)是根据一定的规则自动从万维网上爬取信息的程序或脚本。
学过爬虫的同学都知道,目前80%的爬虫都是用Python写的:
原因一:由于目前大部分网络协议都是基于HTTP/HTTPS的,而java的基础框架支持TCP/IP网络协议,构建爬虫时需要导入大量底层库;
原因2:Python有很多开源爬虫库,好用,也有Java的,但是Java入门比较难;
理由三:Python语言简单难懂。相比之下,Java语言更复杂,理解难度也增加了;
好了,这次回到我们的话题,修改后的例子是一个基于JavaClient加正则化的爬虫来简单实现Java Maven项目采集的图片数据!
二、必需的 pom.xml 依赖项
org.jsoup
jsoup
1.8.3
commons-io
commons-io
2.5
org.apache.httpcomponents
httpclient
4.5.5
有同学创建Maven项目后,程序还是跑错了!只要三点修改,就会更宽!(基于 JDK1.8)
1.修改pom.xml依赖中的JDK版本号
UTF-8
1.8
1.8
2.根据下图找到项目*敏*感*词*标,进入Project Settings --> Modules -->Souces->Language level:设置为8;
3 进入项目设置文件,Settings–>Build, Execution, Deployment–>Compiler–>Java Compiler–>Moudle:配置JDK版本为8;
三点后就可以配置了
三.java代码(附详细注释)
因为我这里是一个简单的java爬虫,所以我只用了一个java文件写成静态方法,方便调用
爬取图片下载到本地
html.java
<p>import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.*;
import java.util.Scanner;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class scenery {
//编码格式
private static final String ENCODING = "UTF-8";
//保存地址
private static final String SAVE_PATH = "file/background";
/**
* 获取到指定网址的网页源码并返回
* @param url 爬取网址
* @return html
*/
public static String getHtmlResourceByUrl(String url) {
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(url);
HttpEntity httpEntity = null;
String html = null;
// 设置长连接
httpGet.setHeader("Connection", "keep-alive");
// 设置代理(模拟浏览器版本)
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
CloseableHttpResponse httpResponse = null;
System.out.println("开始请求网页!!!!!!!!");
try {
// 请求并获得响应结果
httpResponse = httpClient.execute(httpGet);
httpEntity = httpResponse.getEntity();
// 输出请求结果
html = EntityUtils.toString(httpEntity);
} catch (IOException e) {
e.printStackTrace();
}
return html;
}
/**
* 获取网页的链接与标题,并追加到list中,从而返回list
* @param html 网页地址
* @return list
*/
public static List getTitleUrl(String html){
String regex_img_url = "<img src=\"(.*?)\" alt="/spanspan class="token punctuation";/span
String regex_img_title span class="token operator"=/span span class="token string""div class=\"tits\"(.*?)b class=hightlight"/spanspan class="token punctuation";/span
ArrayListspan class="token generics function"span class="token punctuation"/spanStringspan class="token punctuation"/span/span list span class="token operator"=/span span class="token keyword"new/span span class="token class-name"ArrayList/spanspan class="token operator"/spanspan class="token operator"/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token comment"//创建pattern对象/span
Pattern img_url_p span class="token operator"=/span Patternspan class="token punctuation"./spanspan class="token function"compile/spanspan class="token punctuation"(/spanregex_img_urlspan class="token punctuation")/spanspan class="token punctuation";/span
Pattern img_title_p span class="token operator"=/span Patternspan class="token punctuation"./spanspan class="token function"compile/spanspan class="token punctuation"(/spanregex_img_titlespan class="token punctuation")/spanspan class="token punctuation";/span
span class="token comment"//创建matcher对象/span
Matcher img_url_m span class="token operator"=/span img_url_pspan class="token punctuation"./spanspan class="token function"matcher/spanspan class="token punctuation"(/spanhtmlspan class="token punctuation")/spanspan class="token punctuation";/span
Matcher img_title_m span class="token operator"=/span img_title_pspan class="token punctuation"./spanspan class="token function"matcher/spanspan class="token punctuation"(/spanhtmlspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token keyword"while/span span class="token punctuation"(/spanimg_url_mspan class="token punctuation"./spanspan class="token function"find/spanspan class="token punctuation"(/spanspan class="token punctuation")/span span class="token operator"&&/span img_title_mspan class="token punctuation"./spanspan class="token function"find/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation")/span span class="token punctuation"{/span
String url span class="token operator"=/span img_url_mspan class="token punctuation"./spanspan class="token function"group/spanspan class="token punctuation"(/spanspan class="token number"1/spanspan class="token punctuation")/spanspan class="token punctuation";/span
listspan class="token punctuation"./spanspan class="token function"add/spanspan class="token punctuation"(/spanurlspan class="token punctuation")/spanspan class="token punctuation";/span
String title span class="token operator"=/span img_title_mspan class="token punctuation"./spanspan class="token function"group/spanspan class="token punctuation"(/spanspan class="token number"1/spanspan class="token punctuation")/spanspan class="token punctuation";/span
listspan class="token punctuation"./spanspan class="token function"add/spanspan class="token punctuation"(/spantitlespan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span
span class="token keyword"return/span listspan class="token punctuation";/span
span class="token punctuation"}/span
span class="token comment"/**
* 获取image url 追加到List中,并返回List
* @param details_html 详情页网址
* @return List
*//span
span class="token keyword"public/span span class="token keyword"static/span Listspan class="token generics function"span class="token punctuation"/spanStringspan class="token punctuation"/span/span span class="token function"getImageSrc/spanspan class="token punctuation"(/spanString details_htmlspan class="token punctuation")/spanspan class="token punctuation"{/span
Listspan class="token generics function"span class="token punctuation"/spanStringspan class="token punctuation"/span/span list span class="token operator"=/span span class="token keyword"new/span span class="token class-name"ArrayList/spanspan class="token operator"/spanspan class="token operator"/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
String imgRegex span class="token operator"=/span span class="token string""img src=\"(.*?)\" alt="/spanspan class="token punctuation";/span
span class="token comment"//创建Pattern对象/span
Pattern img_p span class="token operator"=/span Patternspan class="token punctuation"./spanspan class="token function"compile/spanspan class="token punctuation"(/spanimgRegexspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token comment"//创建matcher对象/span
Matcher img_m span class="token operator"=/span img_pspan class="token punctuation"./spanspan class="token function"matcher/spanspan class="token punctuation"(/spandetails_htmlspan class="token punctuation")/spanspan class="token punctuation";/span
Systemspan class="token punctuation"./spanoutspan class="token punctuation"./spanspan class="token function"println/spanspan class="token punctuation"(/spanspan class="token string""开始解析..."/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token keyword"while/span span class="token punctuation"(/spanimg_mspan class="token punctuation"./spanspan class="token function"find/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation")/spanspan class="token punctuation"{/span
listspan class="token punctuation"./spanspan class="token function"add/spanspan class="token punctuation"(/spanimg_mspan class="token punctuation"./spanspan class="token function"group/spanspan class="token punctuation"(/spanspan class="token number"1/spanspan class="token punctuation")/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span
span class="token keyword"return/span listspan class="token punctuation";/span
span class="token punctuation"}/span
span class="token comment"/**
* 下载图片
* @param imgUrl img网址
* @param filePath 图片报错地址
* @param title 图片系列
* @param imageName 图片名
* @param page 页数
* @param count 每页的图片计数
*//span
span class="token keyword"public/span span class="token keyword"static/span span class="token keyword"void/span span class="token function"downLoad/spanspan class="token punctuation"(/spanString imgUrlspan class="token punctuation",/spanString filePathspan class="token punctuation",/span String titlespan class="token punctuation",/span String imageNamespan class="token punctuation",/spanspan class="token keyword"int/span pagespan class="token punctuation",/span span class="token keyword"int/span countspan class="token punctuation")/span span class="token punctuation"{/span
CloseableHttpClient httpClient span class="token operator"=/span HttpClientsspan class="token punctuation"./spanspan class="token function"createDefault/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
HttpGet httpGet span class="token operator"=/span span class="token keyword"new/span span class="token class-name"HttpGet/spanspan class="token punctuation"(/spanimgUrlspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token keyword"try/span span class="token punctuation"{/span
CloseableHttpResponse response span class="token operator"=/span httpClientspan class="token punctuation"./spanspan class="token function"execute/spanspan class="token punctuation"(/spanhttpGetspan class="token punctuation")/spanspan class="token punctuation";/span
Systemspan class="token punctuation"./spanoutspan class="token punctuation"./spanspan class="token function"println/spanspan class="token punctuation"(/spanspan class="token string""第"/span span class="token operator"+/spanpagespan class="token operator"+/span span class="token string""页的"/span span class="token operator"+/span title span class="token operator"+/span span class="token string""系列图片开始下载:"/span span class="token operator"+/span imgUrlspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token keyword"if/span span class="token punctuation"(/spanHttpStatusspan class="token punctuation"./spanSC_OK span class="token operator"==/span responsespan class="token punctuation"./spanspan class="token function"getStatusLine/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation"./spanspan class="token function"getStatusCode/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation")/span span class="token punctuation"{/span
HttpEntity entity span class="token operator"=/span responsespan class="token punctuation"./spanspan class="token function"getEntity/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
InputStream imgContent span class="token operator"=/span entityspan class="token punctuation"./spanspan class="token function"getContent/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token function"saveImage/spanspan class="token punctuation"(/spanimgContentspan class="token punctuation",/span filePathspan class="token punctuation",/spanimageNamespan class="token punctuation")/spanspan class="token punctuation";/span
Systemspan class="token punctuation"./spanoutspan class="token punctuation"./spanspan class="token function"println/spanspan class="token punctuation"(/spanspan class="token string""第"/span span class="token operator"+/span span class="token punctuation"(/spancount span class="token operator"+/span span class="token number"1/spanspan class="token punctuation")/span span class="token operator"+/span span class="token string""张图片下载完成名为:"/span span class="token operator"+/span imageNamespan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span
span class="token punctuation"}/span span class="token keyword"catch/span span class="token punctuation"(/spanspan class="token class-name"ClientProtocolException/span espan class="token punctuation")/span span class="token punctuation"{/span
espan class="token punctuation"./spanspan class="token function"printStackTrace/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span span class="token keyword"catch/span span class="token punctuation"(/spanspan class="token class-name"IOException/span espan class="token punctuation")/span span class="token punctuation"{/span
espan class="token punctuation"./spanspan class="token function"printStackTrace/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span
span class="token punctuation"}/span
span class="token comment"/**
* 保存图片
* @param is 输入数据流
* @param filePath 文件目录Path
* @param imageName image名
*//span
span class="token keyword"public/span span class="token keyword"static/span span class="token keyword"void/span span class="token function"saveImage/spanspan class="token punctuation"(/spanInputStream isspan class="token punctuation",/span String filePathspan class="token punctuation",/span String imageNamespan class="token punctuation")/spanspan class="token punctuation"{/span
span class="token keyword"try/span span class="token punctuation"{/span
span class="token comment"//创建图片文件/span
String imgSavePath span class="token operator"=/span filePathspan class="token punctuation"./spanspan class="token function"concat/spanspan class="token punctuation"(/spanspan class="token string""/"/span span class="token operator"+/span imageName span class="token operator"+/span span class="token string"".jpg"/spanspan class="token punctuation")/spanspan class="token punctuation";/span
File imgPath span class="token operator"=/span span class="token keyword"new/span span class="token class-name"File/spanspan class="token punctuation"(/spanimgSavePathspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token keyword"if/span span class="token punctuation"(/spanspan class="token operator"!/spanimgPathspan class="token punctuation"./spanspan class="token function"exists/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation")/span span class="token punctuation"{/span
imgPathspan class="token punctuation"./spanspan class="token function"createNewFile/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span
FileOutputStream fos span class="token operator"=/span span class="token keyword"new/span span class="token class-name"FileOutputStream/spanspan class="token punctuation"(/spanimgPathspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token keyword"byte/spanspan class="token punctuation"[/spanspan class="token punctuation"]/span bytes span class="token operator"=/span span class="token keyword"new/span span class="token class-name"byte/spanspan class="token punctuation"[/spanspan class="token number"1024/span span class="token operator"*/span span class="token number"1024/span span class="token operator"*/span span class="token number"1024/spanspan class="token punctuation"]/spanspan class="token punctuation";/span
span class="token keyword"int/span len span class="token operator"=/span span class="token number"0/spanspan class="token punctuation";/span
span class="token keyword"while/span span class="token punctuation"(/spanspan class="token punctuation"(/spanlen span class="token operator"=/span isspan class="token punctuation"./spanspan class="token function"read/spanspan class="token punctuation"(/spanbytesspan class="token punctuation")/spanspan class="token punctuation")/span span class="token operator"!=/span span class="token operator"-/spanspan class="token number"1/spanspan class="token punctuation")/spanspan class="token punctuation"{/span
fosspan class="token punctuation"./spanspan class="token function"write/spanspan class="token punctuation"(/spanbytesspan class="token punctuation",/span span class="token number"0/spanspan class="token punctuation",/span lenspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span
fosspan class="token punctuation"./spanspan class="token function"flush/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
fosspan class="token punctuation"./spanspan class="token function"close/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span span class="token keyword"catch/span span class="token punctuation"(/spanspan class="token class-name"IOException/span espan class="token punctuation")/span span class="token punctuation"{/span
espan class="token punctuation"./spanspan class="token function"printStackTrace/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/spanspan class="token keyword"finally/span span class="token punctuation"{/span
span class="token keyword"try/spanspan class="token punctuation"{/span
isspan class="token punctuation"./spanspan class="token function"close/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span span class="token keyword"catch/span span class="token punctuation"(/spanspan class="token class-name"IOException/span espan class="token punctuation")/span span class="token punctuation"{/span
espan class="token punctuation"./spanspan class="token function"printStackTrace/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation";/span
span class="token punctuation"}/span
span class="token punctuation"}/span
span class="token punctuation"}/span
span class="token keyword"public/span span class="token keyword"static/span span class="token keyword"void/span span class="token function"run/spanspan class="token punctuation"(/spanspan class="token punctuation")/spanspan class="token punctuation"{/span
span class="token comment"//循环获取列表页的html/span
String title span class="token operator"=/span span class="token string"""/spanspan class="token punctuation";/span
span class="token comment"//采集类型大家阔以自行发挥!!!/span
Scanner input span class="token operator"=/span span class="token keyword"new/span span class="token class-name"Scanner/spanspan class="token punctuation"(/spanSystemspan class="token punctuation"./spaninspan class="token punctuation")/spanspan class="token punctuation";/span
Systemspan class="token punctuation"./spanoutspan class="token punctuation"./spanspan class="token function"println/spanspan class="token punctuation"(/spanspan class="token string""*********************欢迎来到洋群满满壁纸下载地,请选择你想要下载序列的序号!*********************"/spanspan class="token punctuation")/spanspan class="token punctuation";/span
Systemspan class="token punctuation"./spanoutspan class="token punctuation"./spanspan class="token function"println/spanspan class="token punctuation"(/spanspan class="token string""1>>>风景|2>>>美女|3>>>汽车|4>>>*敏*感*词*|5>>>二次元|6>>>森林|7>>>明星|8>>>猜你喜欢(You Know!!!)");
System.out.print("请选择:");
int choose = input.nextInt();
switch (choose){
case 1:
title = "风景";
break;
case 2:
title = "美女";
break;
case 3:
title = "汽车";
break;
case 4:
title = "*敏*感*词*";
break;
case 5:
title = "二次元";
break;
case 6:
title = "森林";
break;
case 7:
title = "明星";
break;
case 8:
title = "性感";
break;
default:
title = "风景";
System.out.println("选择错误,默认采集风景系列图片!!!");
break;
}
int page = 1;
for (; page