nodejs抓取动态网页(/module/cvsresovle/js)

优采云 发布时间: 2022-02-13 18:13

  nodejs抓取动态网页(/module/cvsresovle/js)

  有这样一个需求:先从csv文件中读取要解析的url数据,然后使用puppeteer和chrome-har获取浏览器的HAR数据。在调试的过程中发现for循环中的操作是异步执行的,不会按顺序等待每次抓取完成,最终找到了解决办法,这里记录一下。

  1、创建解析csv文件的代码(ultra-harlog/module/cvsresovle.js)

  const fs = require("fs");

const path = require("path");

const csv =require('csv');

const parse = require('csv-parse/lib/sync')

const iconv = require('iconv-lite');

/*

npm install iconv-lite

*/

/**
 * Read a CSV file synchronously and parse it into an array of records.
 *
 * The file is decoded as UTF-8. A leading byte-order mark is removed before
 * parsing; otherwise it would become part of the first column name, because
 * the header row is used for the record keys (`columns: true`).
 *
 * Example of one parsed record:
 *   { ' ': '142',
 *     AREA_NAME: '湖北',
 *     SITE_LINK: 'www.banggo.com',
 *     BEARING_MODE: '移动接入',
 *     SITE_NAME: '邦购',
 *     MENU_TYPE: '二级' }
 *
 * @param {string} csvpath - Path of the CSV file to read.
 * @returns {*} Whatever the synchronous `parse` call returns for the file content.
 * @throws Propagates any error raised by `fs.readFileSync` or `parse`.
 */
function readUrlRecord(csvpath){
  console.log('开始解析文件:' + csvpath);

  const raw = fs.readFileSync(csvpath, 'utf8');

  // fs does not strip a UTF-8 BOM when decoding, so remove it here.
  const input = raw.charCodeAt(0) === 0xFEFF ? raw.slice(1) : raw;

  return parse(input, {
    // Use the first row as the keys of every record.
    columns: true,
    skip_empty_lines: true,
    delimiter: ',',
  });
}

// Manual check, left disabled:
//readUrlRecord('../top300.csv') ;

// Expose readUrlRecord to modules that require this file.
exports.readUrlRecord = readUrlRecord;

  2、创建抓取的主代码(ultra-harlog/module/puppeteerhar-event.js)

  const fs = require('fs');

const { promisify } = require('util');

const path = require("path");

const puppeteer = require('puppeteer');

const { harFromMessages } = require('chrome-har');

const logger=require("./log");

const log = logger.getPuppeteerHarEventRecordLogger() ;

//https://michaljanaszek.com/blog/generate-har-with-puppeteer

//https://www.npmjs.com/package/chrome-har

// DevTools protocol events to subscribe to; each received event is recorded
// as a { method, params } entry and later handed to harFromMessages.
const observe = [
  // Page lifecycle and frame events
  "Page.loadEventFired",
  "Page.domContentEventFired",
  "Page.frameStartedLoading",
  "Page.frameAttached",
  // Network request/response events
  "Network.requestWillBeSent",
  "Network.requestServedFromCache",
  "Network.dataReceived",
  "Network.responseReceived",
  "Network.resourceChangedPriority",
  "Network.loadingFinished",
  "Network.loadingFailed",
];

/**
 * Launch a browser through `puppeteer.launch` with the fixed options used by
 * the HAR capture code.
 *
 * @returns {Promise<*>} Resolves with the value returned by `puppeteer.launch`.
 * @throws Propagates any rejection from `puppeteer.launch`.
 */
async function launchBrowser(){
  // Browser command-line switches. Reference list:
  // https://peter.sh/experiments/chromium-command-line-switches/
  // The former '--enable-feaures' entry was removed: it is misspelled and
  // carried no feature list, so it did not match any switch.
  const args = [
    '--disk-cache-size=0',
    '--disable-cache',
    '--disable-infobars',
    '--window-size=800,600',
    '--ignore-certificate-errors',
  ];

  return puppeteer.launch({
    // For a manually downloaded Chromium, add `executablePath` pointing at the
    // binary; by default the copy under
    // <project>/node_modules/puppeteer/.local-chromium/ is used.

    // Ignore HTTPS errors when visiting https pages.
    ignoreHTTPSErrors: true,

    // Run headless: no browser window is opened.
    headless: true,

    args,

    // Whether to auto-open a DevTools panel for each tab. If this were true,
    // headless would be set to false.
    devtools: false,

    // Defaults to 30000 (30 seconds). Pass 0 to disable timeout.
    timeout: 0,

    // For debugging, `slowMo: 250` slows down every puppeteer action.
  });
}

/**
 * Load `url` in a freshly launched browser, record the observed DevTools
 * protocol events, convert them with `harFromMessages` and write the result
 * as JSON to `path.join(dirPath, filename)`.
 *
 * A navigation failure is logged and does not abort the run: the HAR is still
 * built from whatever events were recorded before the failure. Failures while
 * setting up the protocol session are propagated, but the browser is closed
 * first so no browser process is left behind.
 *
 * @param {string} url - Address to visit; "http://" is prepended when it has
 *   no http:// or https:// prefix.
 * @param {string} dirPath - Directory the HAR file is written to.
 * @param {string} filename - Name of the HAR file inside `dirPath`.
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws Propagates errors from launching the browser, session setup,
 *   closing the browser, `harFromMessages` and the file write.
 */
async function saveHarlog(url,dirPath,filename){
  // Destination of the generated HAR file.
  const harFilePath = path.join(dirPath, filename);

  // Default to plain http when the address carries no scheme.
  const hasScheme = url.startsWith('http://') || url.startsWith('https://');
  const targetUrl = hasScheme ? url : "http://" + url;

  const browser = await launchBrowser();

  // Recorded protocol events, later converted into HAR data.
  const events = [];

  try {
    // Reuse the first page returned by the browser instead of opening a new one.
    const page = (await browser.pages())[0];

    // Open a protocol session on the page and enable the observed domains.
    const client = await page.target().createCDPSession();
    await client.send('Page.enable');
    await client.send('Network.enable');

    // Register one listener per observed event.
    for (const method of observe) {
      client.on(method, (params) => {
        events.push({ method, params });
      });
    }

    try {
      // Navigate to the target; timeout 0 disables the navigation timeout.
      await page.goto(targetUrl, { timeout: 0 });
    } catch (error) {
      // Best effort: keep going and save whatever was recorded.
      log.info('resovle error :' + targetUrl + "; error message:" + error);
    }
  } finally {
    // Close the browser on every path, including session setup failures.
    await browser.close();
  }

  const har = harFromMessages(events);

  await promisify(fs.writeFile)(harFilePath, JSON.stringify(har));
}

// Expose the browser launcher and the HAR capture function to other modules.
exports.launchBrowser = launchBrowser;

exports.saveHarlog = saveHarlog;

  3、创建启动文件(ultra-harlog/puppeteerhar-event-app.js)

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线