htmlunit抓取动态网页(需要的jar包代码如下：链接不支持xpath解析)

优采云发布时间: 2022-03-23 02:24

　　需要的jar包：

　　<dependency>
　　    <groupId>org.jsoup</groupId>
　　    <artifactId>jsoup</artifactId>
　　    <version>1.10.3</version>
　　</dependency>

　　代码如下：

　　 1 // 请求超时时间，30秒

/** Request timeout in milliseconds (30 seconds). */
public static final int TIME_OUT = 30 * 1000;

/**
 * Request headers sent with every fetch so the request looks like it comes from a
 * desktop Firefox browser. Typed as String-to-String so it can be handed to an HTTP
 * client without unchecked conversions.
 */
public static Map<String, String> headers = new HashMap<String, String>();

static {
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0");
    headers.put("Accept", "text/html");
    headers.put("Accept-Language", "zh-CN,zh");
}

10

11 //根据url获取html文档

12 protected Document getDoc(String url) throws IOException{

13 if(logger.isDebugEnabled())

14 logger.debug(url);

15 //新建一个连接

16 Connection conn = Jsoup.connect(url).timeout(TIME_OUT);

17 conn = conn.headers(headers);

18 conn = conn.proxy(Proxy.NO_PROXY);

19 Document doc = conn.get();

20

21 if(logger.isTraceEnabled()){

22 logger.trace("["+url+"]\n"+doc);

23 }

24 return doc;

25 }

　　 1 public static final String CHINAZ_ICP_URL = "http://icp.chinaz.com/?type=host&s=%s";

2 public List doHandler(String domain) {

3 List results = new ArrayList();

4 String url = String.format(CHINAZ_ICP_URL, domain);

5 Document doc;

6 try {

7 doc = this.getDoc(url);

8 // 获取当前页ICP信息所在标签

9 Elements eles = doc.select("ul.IcpMain01>li:lt(7)>p");

10

11 if(null == eles || eles.isEmpty()){

12 return results;

13 }

14 //获取ICP信息

15 for (Element element : eles) {

16 //当前元素为认证信息时，跳过

17 if("safe".equals(element.attr("id"))){

18 continue;

19 }

20 Node firstNode = element.childNode(0);

21 if(firstNode.childNodeSize() > 0){

22 results.add(element.child(0).text());

23 }else{

24 results.add(((TextNode)firstNode).text());

25 }

26 }

27 } catch (IOException e) {

28 logger.error("get Chinaz ICP message error :",e);

29 }

30 doc = null;

31 return results;

32 }

　　参考Jsoup的文档：链接

　　Jsoup不支持xpath解析，很蛋疼，不过有人要弄个支持xpath的东西---JsoupXpath，链接，有兴趣的网友可以自己试试！

　　三、htmlunit

　　支持Xpath解析，可以模拟浏览器动作，比如点击“下一页”、“加载更多”。文档链接：

　　需要的jar包

　　<dependency>
　　    <groupId>net.sourceforge.htmlunit</groupId>
　　    <artifactId>htmlunit</artifactId>
　　    <version>2.18</version>
　　</dependency>

　　代码如下：

　　 1 import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;

2 import com.gargoylesoftware.htmlunit.TopLevelWindow;

3 import com.gargoylesoftware.htmlunit.WebClient;

4 import com.gargoylesoftware.htmlunit.html.HtmlPage;

5 import com.gargoylesoftware.htmlunit.html.HtmlTableRow;

6

7 import java.io.IOException;

8 import java.util.ArrayList;

9 import java.util.List;

10

11

12 public class UrlTest {

13

14 public static void main(String[] args) {

15 BaseCollector baseCollector = new BaseCollector();

16 WebClient webClient = baseCollector.getWebClient();

17 String url="http://htmlunit.sourceforge.net/";

18 HtmlPage homePage= null;

19 try {

20 homePage = webClient.getPage(url);

21 if (homePage != null && homePage instanceof HtmlPage) {

22 homePage.getEnclosingWindow().setName("IpHomePage");

23 System.out.println("打开 IPHomePage ");

24 System.out.println("内容是： "+homePage.getBody().getTextContent());

25

26 List htmlTableRows = (List) homePage.getByXPath("/html/body/pre");

27 if (htmlTableRows != null && htmlTableRows.size() > 0) {

28 for (int i = 0; i < htmlTableRows.size(); i++) {

29 HtmlTableRow htmlTableRow = htmlTableRows.get(i);

30 //日期

31 String firstTime = htmlTableRow.getCell(0).getTextContent().trim();

32 System.out.println(firstTime);

33 }

34

35 }

36 closeWindowByName(webClient, "IPHomePage");

37 System.out.println("关闭 IPHomePage ");

38 }

39 webClient.close();

40

41 } catch (IOException e) {

42 System.out.println(e.getMessage()+" ===="+e);

43 }catch (FailingHttpStatusCodeException e){

44 System.out.println(e.getMessage()+" ===="+e);

45 }

46 System.out.println("内容是： "+homePage.getBody().getTextContent());

47 }

48

49 public static void closeWindowByName(WebClient webClient, String name){

50 List list = webClient.getTopLevelWindows();

51 List windowNames = new ArrayList();

52 for (int j = 0; j < list.size(); j++) {

53 if(list.get(j).getName().equals(name)){

54 list.get(j).close();

55 }

56 windowNames.add(list.get(j).getName());

57 }

58 System.out.println("当前窗口： {}"+list.toString());

59 }

60 }

61

62

　　四、HeadlessChrome

　　1、HeadlessChrome 与 PhantomJS 对比

　　在 Chrome 提供原生无头模式之前，Web 开发者只能使用 PhantomJS 等第三方无头浏览器。如今 Headless Chrome 已正式可用，PhantomJS 的维护者 Vitaly Slobodin 在邮件列表中宣布辞去维护者一职。另一款主流浏览器 Firefox 也准备提供 Headless 模式。

　　2、什么是HeadlessChrome

　　HeadlessChrome 是一种无界面形式的 Chrome 浏览器。您可以使用 Chrome 支持的所有功能运行您的程序，而无需打开浏览器。与现代浏览器相比，HeadlessChrome更方便测试web应用、获取网站的截图、爬取信息等。

　　3、环境配置

　　您需要先下载 chrome-driver。不同版本的 Chrome 对应不同的 Chrome 驱动程序。您可以通过此链接下载相应的Chrome驱动程序

　　支持各种元素的获取，List elements = driver.findElements(By.xpath("//*[@id=\"body\"]/ul[2]/li"));

　　可以模拟浏览器的各种动作，driver.findElement(By.linkText("Next")).click();

　　用Python做HeadlessChrome更方便简单，简直太神奇了。 . . . 链接：

　　你可以参考一下

　　需要的jar包：

　　<dependency>
　　    <groupId>org.seleniumhq.selenium</groupId>
　　    <artifactId>selenium-chrome-driver</artifactId>
　　    <version>3.11.0</version>
　　</dependency>

　　代码如下：

<p> 1 import org.jsoup.Jsoup;

2 import org.jsoup.nodes.Document;

3 import org.openqa.selenium.By;

4 import org.openqa.selenium.WebDriver;

5 import org.openqa.selenium.WebElement;

6 import org.openqa.selenium.chrome.ChromeDriver;

7 import org.openqa.selenium.chrome.ChromeOptions;

8

9 import java.util.List;

10 import java.util.concurrent.TimeUnit;

11

12 /**

13 * Created by sqy on 2018/5/2.

14 */

15 public class HeadlessChromeTest {

16

17 public static void main(String args[]) {

18

19

20

21 //G:\chromedriver

22 System.setProperty("webdriver.chrome.driver","G:\\chromedriver\\chromedriver.exe");

23 ChromeOptions chromeOptions = new ChromeOptions();

24 // 设置为 headless 模式（必须）

25 chromeOptions.addArguments("--headless");

26 // 设置浏览器窗口打开大小（非必须）

27 chromeOptions.addArguments("--window-size=1920,1080");

28 WebDriver driver = new ChromeDriver(chromeOptions);

29 driver.get("https://lvyou.baidu.com/scene/s-feb/");

30

31 System.out.println("url: "+driver.getCurrentUrl());

32

33 for(int i=0;i

0

2022-03-23

htmlunit抓取动态网页

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

htmlunit抓取动态网页(需要的jar包代码如下：链接不支持xpath解析)

0 个评论

发起人

AI时代内容工厂

htmlunit抓取动态网页(需要的jar包代码如下：链接不支持xpath解析)

0 个评论

发起人

相关问题