java爬虫抓取网页数据(github地址:快速开始自动下载最新chromium并启动(图))
优采云 发布时间: 2021-12-27 19:07 java爬虫抓取网页数据(github地址:快速开始自动下载最新chromium并启动(图))
github地址: https://github.com/fanyong920/jvppeteer
快速入门
自动下载最新的 Chromium 并启动:
package com.ruiyun.example;
import com.ruiyun.jvppeteer.core.Puppeteer;
import com.ruiyun.jvppeteer.core.browser.Browser;
import com.ruiyun.jvppeteer.core.browser.BrowserFetcher;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
/**
 * Example showing how to download the latest Chromium build, launch it,
 * and print the version string reported by the browser.
 */
public class DownloadChromiumExample2 {

    /**
     * Downloads Chromium through a {@link BrowserFetcher}, launches a browser with
     * {@code Puppeteer.launch(false)}, prints {@code browser.version()} to standard
     * output, and then closes the browser.
     *
     * @param args command-line arguments (unused)
     * @throws IOException          if a jvppeteer call fails with an I/O error
     * @throws InterruptedException if the calling thread is interrupted while waiting
     * @throws ExecutionException   if an asynchronous jvppeteer task fails
     */
    public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
        Puppeteer puppeteer = new Puppeteer();
        // Create the fetcher instance that performs the download.
        BrowserFetcher browserFetcher = puppeteer.createBrowserFetcher();
        // Download the latest Chromium revision.
        browserFetcher.download();
        // NOTE(review): the boolean is presumably the "headless" flag — confirm against the jvppeteer API.
        Browser browser = Puppeteer.launch(false);
        try {
            String version = browser.version();
            System.out.println(version);
        } finally {
            // Always shut the launched browser down so no Chromium process is left behind.
            browser.close();
        }
    }
}
抓取整个页面的内容:
package com.ruiyun.example;
import com.ruiyun.jvppeteer.core.Puppeteer;
import com.ruiyun.jvppeteer.options.LaunchOptions;
import com.ruiyun.jvppeteer.options.OptionsBuilder;
import com.ruiyun.jvppeteer.core.browser.Browser;
import com.ruiyun.jvppeteer.core.page.Page;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Example showing how to launch a locally installed Chrome executable, navigate to a
 * page, and print the page's full content.
 */
public class PageContentExample {

    /**
     * Launches Chrome from a hard-coded executable path with the sandbox disabled,
     * opens a new page, navigates to Baidu, prints the result of {@code page.content()}
     * to standard output, and then closes the browser.
     *
     * @param args command-line arguments (unused)
     * @throws InterruptedException if the calling thread is interrupted while waiting
     * @throws IOException          if a jvppeteer call fails with an I/O error
     */
    public static void main(String[] args) throws InterruptedException, IOException {
        // Use the literal as-is. Encoding it with getBytes() (platform default charset) and
        // decoding the bytes as UTF-8 corrupts the non-ASCII path segments on any machine
        // whose default charset is not UTF-8. Adjust this path to your local chrome.exe.
        String path = "F:\\java教程\\49期\\vuejs\\puppeteer\\.local-chromium\\win64-722234\\chrome-win\\chrome.exe";

        // Fill the argument list before building the options, so the result does not depend
        // on whether the builder keeps a reference to the list or copies it.
        ArrayList<String> arrayList = new ArrayList<>();
        arrayList.add("--no-sandbox");
        arrayList.add("--disable-setuid-sandbox");

        LaunchOptions options = new OptionsBuilder()
                .withArgs(arrayList)
                .withHeadless(false)
                .withExecutablePath(path)
                .build();

        Browser browser = Puppeteer.launch(options);
        try {
            Page page = browser.newPage();
            page.goTo("https://www.baidu.com/?tn=98012088_10_dg&ch=3");
            String content = page.content();
            System.out.println("=======================content=============="+content);
        } finally {
            // Always shut the launched browser down so no Chrome process is left behind.
            browser.close();
        }
    }
}
截图
文件选择
package com.ruiyun.example;
import com.ruiyun.jvppeteer.core.Puppeteer;
import com.ruiyun.jvppeteer.core.browser.Browser;
import com.ruiyun.jvppeteer.core.page.ElementHandle;
import com.ruiyun.jvppeteer.core.page.FileChooser;
import com.ruiyun.jvppeteer.core.page.Page;
import com.ruiyun.jvppeteer.options.LaunchOptions;
import com.ruiyun.jvppeteer.options.OptionsBuilder;
import com.ruiyun.jvppeteer.options.PageNavigateOptions;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
/**
 * Example showing how to drive a native file-chooser dialog: it opens Baidu, clicks
 * through to the image-upload input, and supplies a local file to the chooser.
 */
public class PageFileChooserExample {

    /**
     * Launches Chrome from a hard-coded executable path, navigates to Baidu, registers a
     * wait for a file chooser, clicks the elements that open it, and accepts a local
     * file path once the chooser is available.
     *
     * @param args command-line arguments (unused)
     * @throws InterruptedException if the calling thread is interrupted while waiting
     * @throws ExecutionException   if waiting for the file chooser fails
     * @throws IOException          if a jvppeteer call fails with an I/O error
     */
    public static void main(String[] args) throws InterruptedException, ExecutionException, IOException {
        // Adjust this path to your local chrome.exe.
        String path = "D:\\develop\\project\\toString\\chrome-win\\chrome.exe";

        // Fill the argument list before building the options, so the result does not depend
        // on whether the builder keeps a reference to the list or copies it.
        ArrayList<String> arrayList = new ArrayList<>();
        arrayList.add("--no-sandbox");
        arrayList.add("--disable-setuid-sandbox");

        LaunchOptions options = new OptionsBuilder()
                .withArgs(arrayList)
                .withHeadless(false)
                .withExecutablePath(path)
                .build();

        Browser browser = Puppeteer.launch(options);
        Page page = browser.newPage();
        page.goTo("https://www.baidu.com/?tn=98012088_10_dg&ch=3");

        // Register the wait before clicking anything, so the chooser event is not missed.
        // NOTE(review): 30000 is presumably a timeout in milliseconds — confirm against the jvppeteer API.
        Future<FileChooser> fileChooserFuture = page.waitForFileChooser(30000);

        ElementHandle elementHandle = page.$("#form > span.bg.s_ipt_wr.quickdelete-wrap > span.soutu-btn");
        elementHandle.click();

        // Click the button that opens the file selection dialog.
        ElementHandle button = page.$("#form > div > div.soutu-state-normal > div.upload-wrap > input");
        button.click();

        // Block until the file-chooser event is delivered.
        FileChooser fileChooser = fileChooserFuture.get();

        // Choose the local file to upload.
        List<String> paths = new ArrayList<>();
        paths.add("C:\\Users\\howay\\Desktop\\sunway.png");
        fileChooser.accept(paths);

        // NOTE(review): the browser is left running, as in the original example, so the
        // upload result stays visible; call browser.close() when it is no longer needed.
    }
}
另外还有更多的功能,Jvppeteer可以做到:
- 生成页面 PDF。
- 抓取 SPA(单页应用程序)并生成预渲染的内容(即“SSR”(服务器端渲染))。
- 自动提交表单、UI 测试、键盘输入等。
- 创建一个不断更新的自动化测试环境,使用最新的 JavaScript 和浏览器功能直接在最新版本的 Chrome 中执行测试。
- 捕获网站的时间线轨迹以帮助分析性能问题。
- 测试浏览器扩展。