java爬虫抓取网页数据(自动在工程下创建Pictures文件夹)
优采云 发布时间: 2022-01-15 20:14 java爬虫抓取网页数据(自动在工程下创建Pictures文件夹)
要实现的效果是,在工程下自动创建Pictures文件夹,根据网站URL逐层爬取图片。Pictures下的各级文件夹以对应层级的URL命名,用于存放该层URL下的图片。同时将文件名、路径、URL插入数据库,方便索引。
第一步是创建持久层类来存储文件名、路径和URL。
package org.amuxia.demo;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
/**
 * Small JDBC helper that records crawled files in the {@code FilePath} table
 * (columns {@code filename}, {@code filepath}, {@code url}).
 *
 * <p>Each call opens and closes its own connection and statement, so no JDBC
 * state is kept on the instance between calls.
 */
public class JDBCHelper {
    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String DB_URL = "jdbc:mysql://127.0.0.1:3306/edupic";
    private static final String USER = "root";
    private static final String PASSWORD = "root";
    private static final String INSERT_SQL =
            "insert into FilePath (filename,filepath,url) values (?,?,?)";

    /**
     * Inserts one row describing a stored file.
     *
     * <p>Failures (driver class missing, connection or SQL errors) are printed
     * to standard error and are not propagated to the caller.
     *
     * @param fileName value for the {@code filename} column
     * @param filepath value for the {@code filepath} column
     * @param url      value for the {@code url} column
     */
    public void insertFilePath(String fileName, String filepath, String url) {
        try {
            // Explicit driver loading, kept for drivers that do not self-register.
            Class.forName(DRIVER);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            return;
        }
        // try-with-resources closes the statement and then the connection even
        // when one of them fails to open or closing the other throws.
        try (Connection conn = DriverManager.getConnection(DB_URL, USER, PASSWORD);
                PreparedStatement stmt = conn.prepareStatement(INSERT_SQL)) {
            stmt.setString(1, fileName);
            stmt.setString(2, filepath);
            stmt.setString(3, url);
            stmt.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
第二步,创建一个解析URL并爬取的类
package org.amuxia.demo;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class GetWeb {
private int webDepth = 5; // crawl depth
private int intThreadNum = 1; // number of threads
private String strHomePage = ""; // home page address
private String myDomain; // domain name
private String fPath = "CSDN"; // name of the directory that stores the fetched page files
// Type arguments are spelled out so that element reads such as
// "String s = arrUrls.get(0)" compile without casts.
private ArrayList<String> arrUrls = new ArrayList<String>(); // URLs not yet processed
private ArrayList<String> arrUrl = new ArrayList<String>(); // all URLs, kept for building the index
private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>(); // page number of every URL
private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>(); // depth of every URL
private int intWebIndex = 0; // file index of a page, starting from 0
private long startTime;
private int webSuccessed = 0;
private int webFailed = 0;
/**
 * Entry point: builds a crawler for the hard-coded home page and runs it.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    new GetWeb("http://www.csdn.net/").getWebByHomePage();
}
/**
 * Creates a crawler for the given home page, leaving the crawl depth at its
 * field default.
 *
 * @param s home page URL
 */
public GetWeb(String s) {
    strHomePage = s;
}
/**
 * Creates a crawler for the given home page with an explicit crawl depth.
 *
 * @param s home page URL
 * @param i value stored in {@code webDepth}
 */
public GetWeb(String s, int i) {
    strHomePage = s;
    webDepth = i;
}
/** Adds one to {@code webSuccessed} while holding this instance's monitor. */
public synchronized void addWebSuccessed() {
    webSuccessed += 1;
}
/** Adds one to {@code webFailed} while holding this instance's monitor. */
public synchronized void addWebFailed() {
    webFailed += 1;
}
/**
 * Removes and returns the first entry of {@code arrUrls}, the list of URLs
 * that have not been processed yet.
 *
 * @return the URL that was at index 0
 * @throws IndexOutOfBoundsException if {@code arrUrls} is empty
 */
public synchronized String getAUrl() {
    // remove(int) hands back the removed element, so one call replaces the
    // get + remove pair. The explicit cast keeps this compiling even when the
    // list is declared without a type argument.
    return (String) arrUrls.remove(0);
}
/**
 * Removes and returns the first entry of {@code arrUrl}, the list of all URLs
 * kept for building the index.
 *
 * @return the URL that was at index 0
 * @throws IndexOutOfBoundsException if {@code arrUrl} is empty
 */
public synchronized String getUrl() {
    // remove(int) hands back the removed element, so one call replaces the
    // get + remove pair. The explicit cast keeps this compiling even when the
    // list is declared without a type argument.
    return (String) arrUrl.remove(0);
}
/**
 * Advances {@code intWebIndex} by one and returns the new value.
 *
 * @return the incremented index, boxed as an {@code Integer}
 */
public synchronized Integer getIntWebIndex() {
    return ++intWebIndex;
}
/**
 * Starting from the site supplied by the user, fetches all linked pages.
 *
 * <p>The method resolves the domain, seeds the URL collections with the home
 * page, fetches the home page on the calling thread, starts
 * {@code intThreadNum} {@code Processer} threads, waits until no work is left,
 * then prints a summary and writes the URL index to {@code fileindex.txt}.
 *
 * <p>If the calling thread is interrupted while waiting, its interrupt status
 * is restored and the method returns without writing the summary or the index.
 */
public void getWebByHomePage() {
    startTime = System.currentTimeMillis();
    this.myDomain = getDomain();
    if (myDomain == null) {
        System.out.println("Wrong input!");
        return;
    }
    System.out.println("Homepage = " + strHomePage);
    System.out.println("Domain = " + myDomain);

    // Seed the collections: the home page is page number 0 at depth 1.
    arrUrls.add(strHomePage);
    arrUrl.add(strHomePage);
    allUrls.put(strHomePage, 0);
    deepUrls.put(strHomePage, 1);

    File fDir = new File(fPath);
    if (!fDir.exists()) {
        fDir.mkdir();
    }
    System.out.println("开始工作");

    // Fetch the home page on this thread before the workers start.
    String tmp = getAUrl();
    this.getWebByUrl(tmp, allUrls.get(tmp) + "");
    for (int i = 0; i < intThreadNum; i++) {
        new Thread(new Processer(this)).start();
    }

    // Done when nothing is pending and only this thread is still active.
    // NOTE(review): Thread.activeCount() counts every live thread in the
    // current thread group, so unrelated threads would delay completion.
    while (!(arrUrls.isEmpty() && Thread.activeCount() == 1)) {
        try {
            // Pause between polls instead of spinning at full CPU.
            Thread.sleep(100);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return;
        }
    }

    long finishTime = System.currentTimeMillis();
    long costTime = finishTime - startTime;
    System.out.println("\n\n\n\n\n完成");
    System.out.println(
            "开始时间 = " + startTime + " " + "结束时间 = " + finishTime + " " + "爬取总时间= " + costTime + "ms");
    System.out.println("爬取的URL总数 = " + (webSuccessed + webFailed) + " 成功的URL总数: " + webSuccessed
            + " 失败的URL总数: " + webFailed);

    // Drain arrUrl into the index text; a StringBuilder avoids re-copying the
    // whole string for every URL.
    StringBuilder indexText = new StringBuilder();
    while (!arrUrl.isEmpty()) {
        String tmpUrl = getUrl();
        indexText.append("Web depth:").append(deepUrls.get(tmpUrl))
                .append(" Filepath: ").append(fPath).append("/web")
                .append(allUrls.get(tmpUrl)).append(".htm")
                .append("url:").append(tmpUrl).append("\n\n");
    }
    String strIndex = indexText.toString();
    System.out.println(strIndex);

    // try-with-resources closes the writer even if writing fails.
    try (PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"))) {
        pwIndex.println(strIndex);
    } catch (Exception e) {
        System.out.println("生成索引文件失败!");
    }
}
/**
* 对后续解析的网站进行爬取
*
* @param strUrl
* @param fileIndex
*/
public void getWebByUrl(String strUrl, String fileIndex) {
try {
System.out.println("通过URL得到网站: " + strUrl);
URL url = new URL(strUrl);
URLConnection conn = url.openConnection();
conn.setDoOutput(true);
InputStream is = null;
is = url.openStream();
String filename = strUrl.replaceAll("/", "_");
filename = filename.replace(":", ".");
if (filename.indexOf("*") > 0) {
filename = filename.replaceAll("*", ".");
}
if (filename.indexOf("?") > 0) {
filename = filename.replaceAll("?", ".");
}
if (filename.indexOf("\"") > 0) {
filename = filename.replaceAll("\"", ".");
}
if (filename.indexOf(">") > 0) {
filename = filename.replaceAll(">", ".");
}
if (filename.indexOf("