Java 爬虫抓取网页数据:Jvppeteer 快速入门(自动下载最新 Chromium 并启动,附 GitHub 地址)

优采云 发布时间: 2021-12-27 19:07

  Java 爬虫抓取网页数据:Jvppeteer 快速入门(自动下载最新 Chromium 并启动,附 GitHub 地址)

  GitHub 地址: https://github.com/fanyong920/jvppeteer

  快速入门

  自动下载最新的 Chromium 并启动:

  package com.ruiyun.example;

import com.ruiyun.jvppeteer.core.Puppeteer;

import com.ruiyun.jvppeteer.core.browser.Browser;

import com.ruiyun.jvppeteer.core.browser.BrowserFetcher;

import java.io.IOException;

import java.util.concurrent.ExecutionException;

/**
 * Example that downloads the latest Chromium build and then launches it.
 */
public class DownloadChromiumExample2 {

    /**
     * Downloads Chromium through a {@link BrowserFetcher}, launches a browser with
     * {@code Puppeteer.launch(false)}, prints the version string the browser reports,
     * and finally closes the browser.
     *
     * @param args command-line arguments (unused)
     * @throws IOException          propagated from the jvppeteer calls
     * @throws InterruptedException propagated from the jvppeteer calls
     * @throws ExecutionException   propagated from the jvppeteer calls
     */
    public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
        Puppeteer puppeteer = new Puppeteer();

        // Create the downloader instance.
        BrowserFetcher browserFetcher = puppeteer.createBrowserFetcher();

        // Download the latest Chromium revision.
        browserFetcher.download();

        Browser browser = Puppeteer.launch(false);
        try {
            String version = browser.version();
            System.out.println(version);
        } finally {
            // Always close the browser so the launched Chromium process is not left running.
            browser.close();
        }
    }
}

  抓取整个页面的内容:

  package com.ruiyun.example;

import com.ruiyun.jvppeteer.core.Puppeteer;

import com.ruiyun.jvppeteer.options.LaunchOptions;

import com.ruiyun.jvppeteer.options.OptionsBuilder;

import com.ruiyun.jvppeteer.core.browser.Browser;

import com.ruiyun.jvppeteer.core.page.Page;

import java.io.IOException;

import java.util.ArrayList;

/**
 * Example that opens a page in a locally installed Chromium and prints its full HTML content.
 */
public class PageContentExample {

    /**
     * Launches the browser at a fixed executable path, navigates to the Baidu home page,
     * prints the page content to standard output, and closes the browser.
     *
     * @param args command-line arguments (unused)
     * @throws InterruptedException propagated from the jvppeteer calls
     * @throws IOException          propagated from the jvppeteer calls
     */
    public static void main(String[] args) throws InterruptedException, IOException {
        // Use the literal as-is. Round-tripping it through getBytes() (platform default charset)
        // and new String(..., "UTF-8") corrupts the non-ASCII path segments whenever the
        // platform charset is not UTF-8.
        String path = "F:\\java教程\\49期\\vuejs\\puppeteer\\.local-chromium\\win64-722234\\chrome-win\\chrome.exe";

        // Populate the argument list before the options are built, so the result does not
        // depend on the builder keeping a live reference to this list.
        ArrayList<String> launchArgs = new ArrayList<>();
        launchArgs.add("--no-sandbox");
        launchArgs.add("--disable-setuid-sandbox");

        LaunchOptions options = new OptionsBuilder()
                .withArgs(launchArgs)
                .withHeadless(false)
                .withExecutablePath(path)
                .build();

        Browser browser = Puppeteer.launch(options);
        try {
            Page page = browser.newPage();
            page.goTo("https://www.baidu.com/?tn=98012088_10_dg&ch=3");

            String content = page.content();
            System.out.println("=======================content=============="+content);
        } finally {
            // Always close the browser so the launched Chromium process is not left running.
            browser.close();
        }
    }
}

  截图

  文件选择

  package com.ruiyun.example;

import com.ruiyun.jvppeteer.core.Puppeteer;

import com.ruiyun.jvppeteer.core.browser.Browser;

import com.ruiyun.jvppeteer.core.page.ElementHandle;

import com.ruiyun.jvppeteer.core.page.FileChooser;

import com.ruiyun.jvppeteer.core.page.Page;

import com.ruiyun.jvppeteer.options.LaunchOptions;

import com.ruiyun.jvppeteer.options.OptionsBuilder;

import com.ruiyun.jvppeteer.options.PageNavigateOptions;

import java.io.IOException;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.List;

import java.util.concurrent.ExecutionException;

import java.util.concurrent.Future;

/**
 * Example that drives a file-upload dialog: it opens the Baidu home page, triggers the
 * image-search upload input, and answers the resulting file chooser with a local file.
 */
public class PageFileChooserExample {

    /**
     * Launches the browser, navigates to the page, clicks the two elements that lead to the
     * file input, and accepts the file chooser with a fixed local path.
     *
     * @param args command-line arguments (unused)
     * @throws InterruptedException propagated from the jvppeteer calls
     * @throws ExecutionException   if waiting on the file-chooser future fails
     * @throws IOException          propagated from the jvppeteer calls
     */
    public static void main(String[] args) throws InterruptedException, ExecutionException, IOException {
        // Populate the argument list before the options are built, so the result does not
        // depend on the builder keeping a live reference to this list.
        ArrayList<String> launchArgs = new ArrayList<>();
        launchArgs.add("--no-sandbox");
        launchArgs.add("--disable-setuid-sandbox");

        String path = "D:\\develop\\project\\toString\\chrome-win\\chrome.exe";
        LaunchOptions options = new OptionsBuilder()
                .withArgs(launchArgs)
                .withHeadless(false)
                .withExecutablePath(path)
                .build();

        // NOTE(review): the browser is never closed in this example; presumably it is left
        // open so the upload result can be inspected in the window -- confirm.
        Browser browser = Puppeteer.launch(options);
        Page page = browser.newPage();

        // The navigation options were previously built but never handed to goTo.
        PageNavigateOptions options1 = new PageNavigateOptions();
        options1.setWaitUntil(Arrays.asList("domcontentloaded"));
        page.goTo("https://www.baidu.com/?tn=98012088_10_dg&ch=3", options1);

        // Register interest in the file chooser BEFORE clicking, so the event is not missed.
        // The type argument is required: with a raw Future, get() returns Object and the
        // assignment to FileChooser below does not compile.
        // (30000 is presumably a timeout in milliseconds -- confirm against the jvppeteer API.)
        Future<FileChooser> fileChooserFuture = page.waitForFileChooser(30000);

        ElementHandle elementHandle = page.$("#form > span.bg.s_ipt_wr.quickdelete-wrap > span.soutu-btn");
        elementHandle.click();

        // Click the button that opens the file picker.
        ElementHandle button = page.$("#form > div > div.soutu-state-normal > div.upload-wrap > input");
        button.click();

        // Wait for the file-chooser dialog event to arrive.
        FileChooser fileChooser = fileChooserFuture.get();

        // Choose the local file.
        List<String> paths = new ArrayList<>();
        paths.add("C:\\Users\\howay\\Desktop\\sunway.png");
        fileChooser.accept(paths);
    }
}

  另外还有更多的功能,Jvppeteer可以做到:

  生成页面 PDF。抓取 SPA(单页应用程序)并生成预渲染的内容(即“SSR”(服务器端渲染))。自动提交表单、UI测试、键盘输入等,创建一个不断更新的自动化测试环境。使用最新的 JavaScript 和浏览器功能直接在最新版本的 Chrome 中执行测试。捕获网站的时间线轨迹以帮助分析性能问题。测试浏览器扩展。

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线