基于puppeteer的爬虫1

基于puppeteer的爬虫1

puppeteer中文文档:http://puppeteerjs.com/

开始

利用puppeteer文档中所给示例进行改动:

1
2
3
4
5
6
7
8
9
10
const puppeteer = require('puppeteer');

// Launches a headless browser, opens https://example.com, saves a screenshot
// to example.png and then shuts the browser down.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');
    await page.screenshot({path: 'example.png'});
  } finally {
    // Close the browser even when navigation or the screenshot fails, so the
    // launched browser is not left running after an error.
    await browser.close();
  }
})();

更换目标网址:https://search.bilibili.com/all?keyword=CSS&from_source=nav_suggest_new

在网页中分析元素,获取要爬取的元素的选择器:

利用page元素的$$eval方法:

1
2
// Read the href of every anchor matched by the result-title selector on the
// current page, then append those links to the running `hrefs` list.
const pageLinks = await page.$$eval(
  "div > div.headline.clearfix > a",
  (anchors) => anchors.map((anchor) => anchor.href),
);
hrefs = hrefs.concat(pageLinks);

利用page的click方法点击下一页按钮:

1
2
3
4
// Begin waiting for the navigation and click the "next page" button together:
// both promises are created before either is awaited, so the navigation
// triggered by the click cannot finish before the wait has been registered.
// The resolved values are not needed, so nothing is assigned (the original
// assignment to an undeclared `response` created an implicit global).
await Promise.all([
  page.waitForNavigation(),
  page.click("#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button"),
]);

将爬取结果存入json文件

1
2
3
4
5
// Serialize `all` as tab-indented JSON and write it to data.json; a write
// error is logged to the console.
const serialized = JSON.stringify(all, null, "\t");
fs.writeFile("data.json", serialized, (err) => {
  if (!err) {
    return;
  }
  console.log(err);
});

或者Excel文件:

1
2
3
4
5
// Write the workbook buffer to a.xlsx and report the outcome on the console.
fs.writeFile('a.xlsx', buffer, function(err) {
  if (err) {
    console.log("Write failed: " + err);
    return;
  }

  console.log("Write completed.");
});

全部源码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
const xlsx = require('node-xlsx')
const fs = require("fs");
const puppeteer = require('puppeteer');
// Search results page the crawl starts from.
const SEARCH_URL = 'https://search.bilibili.com/all?keyword=CSS&from_source=nav_suggest_new';
// Anchor that carries both the link and the title of one search result.
const RESULT_LINK_SELECTOR = "div > div.headline.clearfix > a";
// "Next page" button of the pager below the result list.
const NEXT_PAGE_SELECTOR = "#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button";
// Upper bound on the number of result pages to scrape.
const MAX_PAGES = 50;

// Accumulated result links and titles; entry k of both arrays belongs to the
// same search result.
let hrefs = [];
let titles = [];
// Not used by the Excel export below; kept as originally declared.
let all = [];

/**
 * Reads the link and title of every search result on the current page.
 *
 * Both attributes are collected in a single pass over the DOM so that the
 * two values of one result always come from the same element.
 *
 * @param {object} page - Puppeteer page currently showing a results page.
 * @returns {Promise<Array<{href: string, title: string}>>} One entry per
 *   matched result anchor, in document order.
 */
async function scrapeCurrentPage(page) {
  return page.$$eval(RESULT_LINK_SELECTOR, (links) =>
    links.map((x) => ({ href: x.href, title: x.title })));
}

/**
 * Writes the collected links and titles to a.xlsx as a single sheet whose
 * first row holds the links and whose second row holds the titles.
 */
function writeWorkbook() {
  const data = [
    {
      name: 'sheet1',
      data: [hrefs, titles],
    },
  ];
  const buffer = xlsx.build(data);

  fs.writeFile('a.xlsx', buffer, (err) => {
    if (err) {
      console.log(`Write failed: ${err}`);
      return;
    }

    console.log("Write completed.");
  });
}

(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(SEARCH_URL);
    await page.waitForSelector(RESULT_LINK_SELECTOR);

    for (let pageIndex = 0; pageIndex < MAX_PAGES; pageIndex++) {
      const results = await scrapeCurrentPage(page);
      hrefs = hrefs.concat(results.map((result) => result.href));
      titles = titles.concat(results.map((result) => result.title));
      console.log(pageIndex);

      // The last allowed page has been scraped; do not navigate further.
      if (pageIndex === MAX_PAGES - 1) {
        break;
      }

      // Stop early when the search has fewer pages than MAX_PAGES instead of
      // failing on a click at a button that does not exist.
      const nextButton = await page.$(NEXT_PAGE_SELECTOR);
      if (nextButton === null) {
        break;
      }

      // Register the navigation wait before the click takes effect so the
      // page load triggered by the click is not missed.
      await Promise.all([
        page.waitForNavigation(),
        page.click(NEXT_PAGE_SELECTOR),
      ]);
    }
  } finally {
    // Always shut the browser down, including when scraping failed part-way.
    await browser.close();
  }

  writeWorkbook();
})().catch((err) => {
  // Report the failure explicitly rather than leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});