Batch Downloading Images with a Crawler
zKing · 2018-11-25 · Crawler
Summary
Every so often I want to save the images from some website. You can grab them one by one with the Chrome DevTools, but with a large number of images that gets tedious and wastes time, so I figured I'd just write a script.
# Preparation
A quick search online confirmed which modules were needed to get started:
- fs + request to save the data to disk
- puppeteer to crawl the page
- bagpipe to throttle a large batch of concurrent requests; see its docs, plus the minimal sketch right after this list
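For anyone who hasn't seen bagpipe before: it queues callback-style tasks and runs at most N of them at a time. A minimal sketch of that behavior, using a made-up `echo` task purely for illustration:

```js
const Bagpipe = require('bagpipe')

// Run at most 2 tasks concurrently; the rest wait in bagpipe's queue
const bagpipe = new Bagpipe(2)

// bagpipe expects callback-style tasks: the last argument is a callback,
// and calling it is what frees the slot for the next queued task
const echo = (msg, callback) => {
  setTimeout(() => callback(null, msg), 500)
}

for (let i = 0; i < 6; i++) {
  bagpipe.push(echo, 'task ' + i, (err, msg) => {
    console.log('done:', msg)
  })
}
```

Running this prints the results in batches of two, roughly every 500 ms, which is exactly the throttling we want for the image downloads below.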
# Getting Started
```js
const puppeteer = require('puppeteer')
const request = require('request')
const fs = require('fs')
const Bagpipe = require('bagpipe')

const sleep = timeout => new Promise(resolve => setTimeout(resolve, timeout))

;(async () => {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox'],
  })
  const page = await browser.newPage()
  const url = 'https://www.jianshu.com/'
  await page.goto(url, {
    waitUntil: 'networkidle2'
  })
  // Give the feed a moment to finish rendering
  await sleep(1000)
  const result = await page.evaluate(() => {
    // Runs in the page context, so it can use the jQuery ($) that jianshu.com already loads
    let res = []
    let $ = window.$
    let list = $('.note-list')
    list.find('li').each((index, ele) => {
      let item = $(ele)
      let imgUrl = item.find('img').attr('src')
      if (imgUrl !== undefined) {
        res.push({
          imgUrl: 'https:' + imgUrl
        })
      }
    })
    return res
  })
  console.log(result)

  // Make sure the output directory exists before streaming files into it
  if (!fs.existsSync('./catpics')) fs.mkdirSync('./catpics')

  // Cap concurrent downloads at 10; queued tasks wait until a slot frees up
  const bagpipe = new Bagpipe(10)

  // bagpipe expects a callback-style task: calling the callback is what
  // releases the slot, so invoke it once the file has been fully written
  const downloadPic = (src, dest, callback) => {
    request(src)
      .on('error', callback)
      .pipe(fs.createWriteStream(dest))
      .on('close', () => {
        console.log('pic saved!')
        callback(null)
      })
  }

  for (let i = 0; i < result.length; i++) {
    console.log('i:' + i)
    bagpipe.push(downloadPic, result[i].imgUrl, './catpics/' + i + '.jpg', err => {
      if (err) console.error(err)
    })
  }

  await browser.close()
})()
```
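One caveat: page.goto only gets you the first screenful of entries. If the feed lazy-loads more items on scroll (jianshu's appears to), you can scroll from inside puppeteer before calling page.evaluate. A rough sketch reusing the `sleep` helper above; the five rounds and the 500 ms pause are arbitrary guesses, not tuned values:

```js
// Scroll the page a few times so more entries (and their <img> tags) render,
// then scrape as before. Call this after page.goto(...) and before page.evaluate(...).
const autoScroll = async (page, rounds = 5) => {
  for (let i = 0; i < rounds; i++) {
    await page.evaluate(() => window.scrollBy(0, window.innerHeight))
    await sleep(500) // give the lazy-loaded items time to arrive
  }
}
```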
# Extra
Since we've already done it with puppeteer, here is the same scraper written with cheerio for comparison:
```js
const cheerio = require('cheerio')
const request = require('request')
const fs = require('fs')
const Bagpipe = require('bagpipe')

// Parse the raw HTML string with cheerio's jQuery-like API
let parseInfo = (html) => {
  let res = []
  let $ = cheerio.load(html)
  let list = $('.note-list')
  list.find('li').each((index, ele) => {
    let item = $(ele)
    let imgUrl = item.find('img').attr('src')
    if (imgUrl !== undefined) {
      res.push({
        imgUrl: 'https:' + imgUrl
      })
    }
  })
  return res
}

request({
  url: 'https://www.jianshu.com/',
  method: 'GET',
}, function (error, response, body) {
  if (!error && response.statusCode === 200) {
    let result = parseInfo(body)

    // Same download logic as before: ensure the directory exists,
    // then let bagpipe cap the concurrency at 10
    if (!fs.existsSync('./catpics')) fs.mkdirSync('./catpics')
    const bagpipe = new Bagpipe(10)

    const downloadPic = (src, dest, callback) => {
      request(src)
        .on('error', callback)
        .pipe(fs.createWriteStream(dest))
        .on('close', () => {
          console.log('pic saved!')
          callback(null)
        })
    }

    for (let i = 0; i < result.length; i++) {
      console.log('i:' + i)
      bagpipe.push(downloadPic, result[i].imgUrl, './catpics/' + i + '.jpg', err => {
        if (err) console.error(err)
      })
    }
  }
})
```
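A quick note on the trade-off between the two versions: the cheerio one is much lighter, since it never launches a browser, but it only sees what the server returns in the initial HTML. If the images are injected by client-side JavaScript after the page loads, only the puppeteer version will find them.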
# Personal Summary
Back in my first two years of college I thought crawlers were hard, but now the whole thing takes barely an hour to pick up. Much of the credit goes to everyone who put in the work to open-source these modules; starting from scratch would be genuinely difficult.