htmlunit抓取动态网页(上篇文章“Java爬虫_静态页面”动态爬虫工具介绍)

优采云 发布时间: 2021-09-17 05:06

  htmlunit抓取动态网页(上篇文章“Java爬虫_静态页面”动态爬虫工具介绍)

  一、爬虫简介:请查看我之前的文章“Java爬虫_静态页面”

  二、动态爬虫工具简介:

  1、IDEA,开发工具,创建Maven项目

  2、HtmlUnit:一个自动化测试工具,集成了下载(HttpClient)、DOM解析(NekoHTML)和JS引擎(Rhino)

  3、其他jar包:JUnit、jsoup、jxl

  三、开发过程和相关代码

  3.1、创建Maven项目

  

  image.png

  3.2、在pom.xml中添加项目依赖

  

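<project>
  <modelVersion>4.0.0</modelVersion>
  <groupId>cll</groupId>
  <artifactId>demo</artifactId>
  <version>1.0-SNAPSHOT</version>

  <!-- 注:原文中的XML标签已丢失,以下属性名按Maven常见写法还原 -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <maven.compiler.encoding>UTF-8</maven.compiler.encoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.27</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.10.1</version>
    </dependency>
    <dependency>
      <groupId>com.hynnet</groupId>
      <artifactId>jxl</artifactId>
      <version>2.6.12.1</version>
    </dependency>
  </dependencies>
</project>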

  3.3、创建一个Java类QYEmailHelper.java

import CaililiangTools.ConfigHelper;

import com.gargoylesoftware.htmlunit.BrowserVersion;

import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;

import com.gargoylesoftware.htmlunit.WebClient;

import com.gargoylesoftware.htmlunit.html.*;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.nodes.Node;

import org.jsoup.select.Elements;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Date;

import java.util.HashMap;

import java.util.Properties;

public class QYEmailHelper {

// Shared HtmlUnit client created with the Chrome browser profile; closeWebClient() replaces it with a new instance.
static WebClient webClient=new WebClient(BrowserVersion.CHROME);

// Raw (untyped) per-instance list. NOTE(review): not used in the visible part of the class; presumably collects results — confirm.
ArrayList returnList = new ArrayList();

// Inbox frame URL; assigned by UserLogin() just before it returns.
static String baseUrl ="";

// Counter starting at 1. NOTE(review): its use is not visible in this excerpt.
static int num =1;

// Project helper from which WebClientInit() loads the e-mail user settings.
ConfigHelper configHelper = new ConfigHelper();

// Null until WebClientInit() fills it from configHelper.getEmailUserInfos().
Properties properties=null;

/**
 * Initialises the simulated browser: configures the shared {@code webClient}
 * and then loads the e-mail user settings into {@code properties}.
 */
public void WebClientInit(){
    // --- script handling ---
    webClient.getOptions().setJavaScriptEnabled(true);             // run page JavaScript
    webClient.getOptions().setThrowExceptionOnScriptError(false);  // script errors do not raise exceptions
    webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // controller for Ajax calls

    // --- HTTP behaviour ---
    webClient.getCookieManager().setCookiesEnabled(true);          // keep cookies enabled
    webClient.getOptions().setRedirectEnabled(true);               // follow redirects
    webClient.getOptions().setUseInsecureSSL(true);                // skip SSL certificate validation
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);

    // --- rendering and memory ---
    webClient.getOptions().setCssEnabled(false);                   // no CSS, which avoids the follow-up CSS requests
    webClient.getOptions().setActiveXNative(false);
    webClient.getOptions().setMaxInMemory(50000);

    // Load the e-mail account settings last, once the client is configured.
    properties = configHelper.getEmailUserInfos();
}

/**
 * Closes the current shared client and puts a new Chrome-profile
 * {@code WebClient} in its place. The replacement is created with default
 * options; this method does not call WebClientInit().
 */
public void closeWebClient(){
    final WebClient retired = webClient;
    retired.close();
    webClient = new WebClient(BrowserVersion.CHROME);
}

/**
 * Logs a user in to the web mail site and returns the address of the inbox frame.
 *
 * <p>The placeholder account in {@code url} is replaced by {@code name}, the
 * password is entered into the first form on the login page and the form is
 * submitted. The inbox link ({@code folder_1}) is then opened and the
 * {@code src} of its {@code mainFrame} iframe is resolved against the mail host.
 * The result is also stored in {@code baseUrl}.
 *
 * @param url      login page URL containing {@code param=caill@primeton.com}
 * @param name     account name substituted into the URL
 * @param password account password typed into the {@code pp} field
 * @return absolute URL of the inbox frame
 * @throws IllegalStateException if the login form, the inbox link or the inbox
 *                               frame is missing (for example after a failed login)
 * @throws Exception             on page-loading errors raised by HtmlUnit
 */
public String UserLogin(String url,String name,String password) throws Exception{
    final String mailHost = "https://mail.primeton.com";

    url = url.replace("param=caill@primeton.com","param="+name);
    final HtmlPage page = webClient.getPage(url);
    System.err.println("查询中,请稍候");

    // Fail with context instead of an IndexOutOfBoundsException when the page has no form.
    if (page.getForms().isEmpty()) {
        throw new IllegalStateException("No login form found at " + url);
    }
    HtmlForm form = page.getForms().get(0);

    HtmlPasswordInput txtPwd = (HtmlPasswordInput) form.getInputByName("pp"); // password box
    txtPwd.setValueAttribute(password);
    HtmlSubmitInput submit = (HtmlSubmitInput) form.getInputByValue("登录");
    final HtmlPage page2 = (HtmlPage) submit.click(); // submit the login form

    // getElementById returns null when the element is absent, which is what
    // happens when the login was rejected and the mailbox page never loaded.
    DomElement inboxLink = page2.getElementById("folder_1");
    if (inboxLink == null) {
        throw new IllegalStateException(
                "Login failed for user " + name + ": inbox link 'folder_1' not found");
    }
    HtmlPage page3 = webClient.getPage(mailHost + inboxLink.getAttribute("href"));

    DomElement frameElement = page3.getElementById("mainFrame");
    if (frameElement == null) {
        throw new IllegalStateException(
                "Inbox page for user " + name + " has no 'mainFrame' element");
    }
    HtmlInlineFrame frame1 = (HtmlInlineFrame) frameElement;

    baseUrl = mailHost + frame1.getAttribute("src");
    return baseUrl;
}

//抓取Url中的数据

public long getHtmlPage(String url,long startTime,long endTime) throws Exception{

HashMap returnMap = new HashMap();

long endTime2=0L;

HtmlPage page = webClient.getPage(url);

HtmlBody tbody = (HtmlBody) page.getBody();

DomNodeList lists = tbody.getElementsByTagName("table");

//System.out.println( page.asXml());

for(HtmlElement he:lists){

long time =0L;

HashMap results = new HashMap();

String xml = he.asXml();

if(xml.startsWith("

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线