nodejs抓取动态网页(/module/cvsresovle/js)
优采云 发布时间: 2022-02-13 18:13nodejs抓取动态网页(/module/cvsresovle/js)
有这样一个需求:先从csv文件中读取要解析的url数据,然后使用puppeteer和puppeteer-har获取浏览器的HAR数据。在调试的过程中发现for循环中的操作是异步执行的,最终找到了解决办法,这里也记录一下。
1、创建解析csv文件的代码(ultra-harlog/module/cvsresovle.js)
const fs = require("fs");
const path = require("path");
const csv =require('csv');
const parse = require('csv-parse/lib/sync')
const iconv = require('iconv-lite');
/*
npm install iconv-lite
*/
/**
 * Read a CSV file from disk and parse it synchronously into row records.
 *
 * The file is read as UTF-8 text and handed to csv-parse's sync `parse`
 * with `columns: true`, so each returned record is an object keyed by
 * column name, for example:
 *
 *   { ' ': '142',
 *     AREA_NAME: '湖北',
 *     SITE_LINK: 'www.banggo.com',
 *     BEARING_MODE: '移动接入',
 *     SITE_NAME: '邦购',
 *     MENU_TYPE: '二级' }
 *
 * @param {string} csvpath - Path of the CSV file to read.
 * @returns {Array<Object>} Whatever `parse` returns for the file content.
 */
function readUrlRecord(csvpath){
  console.log('开始解析文件:' + csvpath) ;

  // Parser settings: keyed records, blank lines dropped, comma-separated.
  const parseOptions = {
    columns: true,
    skip_empty_lines: true,
    delimiter: ',',
  };

  // Load the whole file into memory before parsing.
  const csvText = fs.readFileSync(csvpath, 'utf8');

  return parse(csvText, parseOptions);
}
//readUrlRecord('../top300.csv') ;
exports.readUrlRecord = readUrlRecord;
2、创建抓取的主代码(ultra-harlog/module/puppeteerhar-event.js)
const fs = require('fs');
const { promisify } = require('util');
const path = require("path");
const puppeteer = require('puppeteer');
const { harFromMessages } = require('chrome-har');
const logger=require("./log");
const log = logger.getPuppeteerHarEventRecordLogger() ;
//https://michaljanaszek.com/blog/generate-har-with-puppeteer
//https://www.npmjs.com/package/chrome-har
// DevTools-protocol events to listen for; every event whose method name
// appears here is recorded so it can later be converted into HAR data.
const observe = [
  // Page domain
  "Page.loadEventFired",
  "Page.domContentEventFired",
  "Page.frameStartedLoading",
  "Page.frameAttached",

  // Network domain
  "Network.requestWillBeSent",
  "Network.requestServedFromCache",
  "Network.dataReceived",
  "Network.responseReceived",
  "Network.resourceChangedPriority",
  "Network.loadingFinished",
  "Network.loadingFailed",
];
/**
 * Start a browser instance through puppeteer with this project's fixed
 * launch options.
 *
 * @returns {Promise<Object>} The value resolved by `puppeteer.launch`.
 */
async function launchBrowser(){
  // Browser command-line switches; see
  // https://peter.sh/experiments/chromium-command-line-switches/
  // NOTE(review): '--enable-feaures' looks like a misspelling of
  // '--enable-features' — confirm the intent before changing it.
  const chromiumArgs = [
    "--disk-cache-size=0",
    "--disable-cache",
    '--disable-infobars',
    '--window-size=800,600',
    '--ignore-certificate-errors',
    '--enable-feaures',
  ];

  const launchOptions = {
    // A manually downloaded Chromium needs `executablePath`; by default
    // puppeteer uses <project>/node_modules/puppeteer/.local-chromium/.
    // Ignore HTTPS errors when visiting https pages.
    ignoreHTTPSErrors: true,
    // Headless mode: no browser window is opened.
    headless: true,
    args: chromiumArgs,
    // Whether to auto-open a DevTools panel for each tab; when true the
    // headless option is forced to false.
    devtools: false,
    // Launch timeout; defaults to 30000 ms, 0 disables it.
    timeout: 0,
    // `slowMo: 250` can be added here to slow puppeteer down for debugging.
  };

  const instance = await puppeteer.launch(launchOptions);
  return instance;
}
/**
 * Visit `url` in a freshly launched browser, record the DevTools-protocol
 * events listed in `observe`, convert them to HAR and write the result to
 * `dirPath/filename`.
 *
 * A navigation failure is logged and does not abort the run: the events
 * captured up to that point are still converted and written. The browser
 * is always closed once it has been launched, including when page or
 * session setup fails (those setup errors are re-thrown to the caller).
 *
 * @param {string} url - Site to visit; "http://" is prepended when the value
 *   does not already start with an http(s) scheme (checked case-insensitively).
 * @param {string} dirPath - Directory that receives the HAR file.
 * @param {string} filename - Name of the HAR file inside `dirPath`.
 * @returns {Promise<void>} Resolves after the HAR file has been written.
 * @throws {TypeError} When `url` is not a string.
 */
async function saveHarlog(url,dirPath,filename){
  if (typeof url !== 'string') {
    throw new TypeError('saveHarlog: url must be a string');
  }

  // Destination of the HAR file.
  const harFilePath = path.join(dirPath, filename);

  // Add a default scheme when none is given; the check ignores case so that
  // e.g. "HTTPS://host" is not turned into "http://HTTPS://host".
  const targetUrl = /^https?:\/\//i.test(url) ? url : "http://" + url;

  // Collected protocol events, later converted into HAR data.
  const events = [];

  const browser = await launchBrowser();
  try {
    // Use the first already-open page instead of creating a new one.
    const page = (await browser.pages())[0];

    // Open a DevTools-protocol session and enable the domains being observed.
    const client = await page.target().createCDPSession();
    await client.send('Page.enable');
    await client.send('Network.enable');

    // Register one listener per observed event.
    observe.forEach((method) => {
      client.on(method, (params) => {
        events.push({ method, params });
      });
    });

    try {
      // Navigate to the target; timeout 0 disables the navigation timeout.
      await page.goto(targetUrl, { timeout: 0 });
    } catch (error) {
      // Best effort: keep going and write whatever was captured.
      log.info('resovle error :' + targetUrl + "; error message:" + error);
    }
  } finally {
    // Runs for setup failures too, so the browser process is never leaked.
    await browser.close();
  }

  const har = harFromMessages(events);
  await promisify(fs.writeFile)(harFilePath, JSON.stringify(har));
}
exports.launchBrowser = launchBrowser;
exports.saveHarlog = saveHarlog;
3、创建启动文件(ultra-harlog/puppeteerhar-event-app.js)