基于puppeteer的爬虫1 puppeteer中文文档:http://puppeteerjs.com/
开始 利用puppeteer文档中所给示例进行改动:
1 2 3 4 5 6 7 8 9 10 const puppeteer = require ('puppeteer' );(async () => { const browser = await puppeteer.launch(); const page = await browser.newPage(); await page.goto('https://example.com' ); await page.screenshot({path : 'example.png' }); await browser.close(); })();
更换目标网址:https://search.bilibili.com/all?keyword=CSS&from_source=nav_suggest_new
在网页中(使用浏览器开发者工具)分析元素,获取要爬取的元素的选择器:
利用page对象的$$eval方法(在页面中对所有匹配选择器的元素执行回调,并返回回调的结果):
1 2 let href = await page.$$eval("div > div.headline.clearfix > a" , (links ) => links.map((x ) => x.href));hrefs = hrefs.concat(href);
利用page的click方法点击下一页按钮:
1 2 3 4 response = await Promise .all([ page.waitForNavigation(), page.click("#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button" ), ]);
将爬取结果存入json文件
1 2 3 4 5 fs.writeFile("data.json" , JSON .stringify(all, null , "\t" ), function (err ) { if (err) { console .log(err); } });
或者Excel文件:
1 2 3 4 5 fs.writeFile('a.xlsx' , buffer, function (err ) { if (err) { console .log("Write failed: " + err); return ; }
// Full source: scrape video links and titles from a Bilibili search result
// listing with Puppeteer and store them in an Excel workbook.
const xlsx = require('node-xlsx');
const fs = require('fs');
const puppeteer = require('puppeteer');

const SEARCH_URL = 'https://search.bilibili.com/all?keyword=CSS&from_source=nav_suggest_new';
const RESULT_LINK_SELECTOR = 'div > div.headline.clearfix > a';
const NEXT_BUTTON_SELECTOR =
  '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button';
// Upper bound on result pages to visit (the first page plus 49 "next" clicks).
const MAX_PAGES = 50;
const OUTPUT_FILE = 'a.xlsx';

/**
 * Reads the href and title of every result link on the page currently loaded.
 *
 * Both attributes are read in a single in-page evaluation so that the two
 * lists cannot get out of step if the DOM changes between calls.
 *
 * @param {object} page - Puppeteer page positioned on a result listing.
 * @returns {Promise<Array<{href: string, title: string}>>} One entry per link.
 */
async function scrapeCurrentPage(page) {
  return page.$$eval(RESULT_LINK_SELECTOR, (links) =>
    links.map((link) => ({ href: link.href, title: link.title })),
  );
}

/**
 * Walks through the result pages and collects every link's href and title.
 *
 * Stops after MAX_PAGES pages, or earlier when the "next page" button is no
 * longer present, instead of failing on a missing button. The browser is
 * always closed, even when navigation or scraping throws.
 *
 * @returns {Promise<{hrefs: string[], titles: string[]}>} Parallel arrays.
 */
async function scrapeAllPages() {
  const hrefs = [];
  const titles = [];
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(SEARCH_URL);
    await page.waitForSelector(RESULT_LINK_SELECTOR);

    for (let pageIndex = 0; pageIndex < MAX_PAGES; pageIndex++) {
      const links = await scrapeCurrentPage(page);
      for (const { href, title } of links) {
        hrefs.push(href);
        titles.push(title);
      }
      console.log(pageIndex);

      if (pageIndex === MAX_PAGES - 1) {
        break;
      }
      // Fewer result pages than MAX_PAGES: keep what was collected so far.
      const nextButton = await page.$(NEXT_BUTTON_SELECTOR);
      if (nextButton === null) {
        break;
      }
      // Start waiting for the navigation before clicking so it is not missed.
      await Promise.all([
        page.waitForNavigation(),
        page.click(NEXT_BUTTON_SELECTOR),
      ]);
    }
  } finally {
    await browser.close();
  }
  return { hrefs, titles };
}

/**
 * Entry point: scrape the listing and write the result to OUTPUT_FILE.
 *
 * The sheet keeps the original layout: the first row holds all hrefs and the
 * second row holds all titles.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { hrefs, titles } = await scrapeAllPages();
  const buffer = xlsx.build([
    {
      name: 'sheet1',
      data: [hrefs, titles],
    },
  ]);

  try {
    await fs.promises.writeFile(OUTPUT_FILE, buffer);
  } catch (err) {
    console.log(`Write failed: ${err}`);
    return;
  }
  console.log('Write completed.');
}

main().catch((err) => {
  // Surface scraping failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});