Skip to content

Instantly share code, notes, and snippets.

@lewis-munyi
Created May 28, 2020 12:32
Show Gist options
  • Select an option

  • Save lewis-munyi/f6b6cef12e4167a16dd00e3840b85f6c to your computer and use it in GitHub Desktop.

Select an option

Save lewis-munyi/f6b6cef12e4167a16dd00e3840b85f6c to your computer and use it in GitHub Desktop.
Web scraping - Create datasets with JavaScript
// Run `npm install puppeteer colors axios cli-progress`
// Run `node index.js`
const puppeteer = require('puppeteer');
const fs = require('fs');
const axios = require('axios');
const colors = require('colors');
const cliProgress = require('cli-progress');
/**
 * Harvests image URLs from a Google Images search with Puppeteer, stores them
 * in `./exports/<query>.txt`, and downloads the listed images into
 * `./exports/<query>/`.
 *
 * Typical flow: `parseUserQuery(query)` -> `harvest()` -> `downloadImages()`.
 */
class BrowserClass {
  constructor() {
    // Viewport dimensions
    this.width = 1200;
    this.height = 800;
    // Number of image URLs to harvest and, later, to download
    this.maxImages = 50;
    // Upper bound (ms) on waiting for image requests once the thumbnails
    // have been clicked, so harvest() cannot hang on a sparse result page
    this.harvestTimeoutMs = 30000;
    // Harvested image URLs
    this.urls = [];
    // Search URL and export file name, both set by parseUserQuery()
    this.url = null;
    this.filename = null;
    // URLs whose download failed
    this.failedUrls = [];
  }

  /**
   * Returns the image extension a URL ends with.
   *
   * @param {string} url - URL to inspect.
   * @returns {string|null} '.jpg', '.jpeg' or '.png', or null for anything else.
   */
  static imageExtension(url) {
    const match = ['.jpg', '.jpeg', '.png'].find((ext) => url.endsWith(ext));
    return match || null;
  }

  /**
   * Derives the export file name and the Google Images search URL from a
   * free-text query.
   *
   * @param {string} query - Search terms, e.g. "Nyama choma".
   * @throws {Error} If the query is not a non-empty string.
   */
  parseUserQuery(query) {
    if (typeof query !== 'string' || query.trim() === '') {
      throw new Error('Pass the query in the class method');
    }
    this.filename = `${query}.txt`; // Create file name
    // Words are joined with "+"; each word is percent-encoded so characters
    // such as "&" or "#" cannot break the query string.
    const encodedQuery = query.split(' ').map(encodeURIComponent).join('+');
    this.url = `https://www.google.com/search?q=${encodedQuery}&rlz=1C1CHBD_enKE893KE893&sxsrf=ALeKk022coDKoBy8l-iiOi9X_fb805nnFw:1589622014644&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiV4om8i7jpAhXQzoUKHfuVAwoQ_AUoAXoECBoQAw`;
  }

  /**
   * Opens the search page, scrolls it, clicks the thumbnails and records the
   * URL of every intercepted request ending in .jpg/.jpeg/.png until
   * `maxImages` URLs are collected or `harvestTimeoutMs` elapses. The URLs
   * are then written out with downloadUrls() and the browser is closed.
   *
   * @returns {Promise<void>}
   * @throws {Error} If parseUserQuery() has not been called first.
   */
  async harvest() {
    if (!this.url) {
      throw new Error('No search URL set; call parseUserQuery() before harvest()');
    }

    // Puppeteer instance
    const browser = await puppeteer.launch({
      headless: false,
      devtools: false,
      args: [`--window-size=${this.width},${this.height}`],
    });
    let finished = false;

    try {
      // Open page and set its size
      const page = await browser.newPage();
      await page.setViewport({
        width: this.width,
        height: this.height,
        deviceScaleFactor: 1,
      });

      let counter = 0;
      let notifyLimitReached;
      const limitReached = new Promise((resolve) => {
        notifyLimitReached = resolve;
      });

      // The handler must exist before navigation starts: with interception
      // enabled every request stalls until something continues it.
      page.on('request', (request) => {
        counter++;
        const requestUrl = request.url();
        console.log(`Request ${counter}: URL: ${requestUrl}`);
        const wanted =
          this.urls.length < this.maxImages &&
          BrowserClass.imageExtension(requestUrl) !== null;
        if (wanted) {
          this.urls.push(requestUrl); // Add the URL to our URLs list
          if (this.urls.length === this.maxImages) {
            notifyLimitReached();
          }
        }
        request.continue().catch((err) => {
          // Requests still in flight while the browser shuts down are
          // expected to fail; anything earlier is worth reporting.
          if (!finished) {
            console.warn(`Could not continue request ${requestUrl}: ${err.message}`);
          }
        });
      });
      await page.setRequestInterception(true);

      await page.goto(this.url, {
        waitUntil: 'networkidle2',
      });
      console.info('Listening for http requests\n\n');

      // Keep scrolling until the bottom is reached or scrolling has
      // triggered more than 40 requests (navigation requests are excluded).
      const requestsBeforeScroll = counter;
      const canScrollFurther = () =>
        page.evaluate(
          () =>
            document.scrollingElement.scrollTop + window.innerHeight <
            document.scrollingElement.scrollHeight
        );
      while (counter - requestsBeforeScroll <= 40 && (await canScrollFurther())) {
        await page.evaluate((y) => {
          document.scrollingElement.scrollBy(0, y);
        }, 100);
        // Wait for the image skeletons to load
        await new Promise((resolve) => setTimeout(resolve, 100));
      }

      // Click the thumbnails so the page requests the full-size resources
      await page.evaluate(() => {
        const elements = document.querySelectorAll('img.rg_i.Q4LuWd.tx8vtf');
        console.log('Elements: ' + elements.length); // Log the number of image elements found
        elements.forEach((element) => element.click());
      });

      // Wait for enough image requests, but never indefinitely
      if (this.urls.length < this.maxImages) {
        let timer;
        const timedOut = new Promise((resolve) => {
          timer = setTimeout(resolve, this.harvestTimeoutMs);
        });
        try {
          await Promise.race([limitReached, timedOut]);
        } finally {
          clearTimeout(timer);
        }
      }

      await this.downloadUrls(); // Export all links
    } finally {
      finished = true;
      await browser.close(); // Also closes every page of this session
    }
  }

  /**
   * Appends the harvested URLs, one per line, to `./exports/<filename>`,
   * creating the directory when needed. `filename` already carries the
   * ".txt" suffix, so this is the same file downloadImages() reads by default.
   *
   * @returns {Promise<void>} Settles once the file has been written.
   * @throws {Error} If parseUserQuery() has not been called first, or on a
   *   file-system error.
   */
  async downloadUrls() {
    if (!this.filename) {
      throw new Error('No export file name set; call parseUserQuery() first');
    }
    await fs.promises.mkdir('./exports', { recursive: true });
    // Every line is newline-terminated so repeated appends never fuse two URLs
    const lines = this.urls.map((entry) => `${entry}\n`).join('');
    await fs.promises.appendFile(`./exports/${this.filename}`, lines);
  }

  /**
   * Downloads up to `maxImages` images listed in `./exports/<foldername>.txt`
   * into `./exports/<foldername>/`, naming them 1.<ext>, 2.<ext>, ... and
   * reporting progress on the module-level `progressBar`.
   *
   * @param {string} [foldername] - Export name without ".txt"; defaults to
   *   the name derived from the last parsed query.
   * @returns {Promise<void>} Settles after every download has been attempted.
   * @throws {Error} If no folder name can be determined or the URL list
   *   cannot be read.
   */
  async downloadImages(foldername) {
    if (!foldername && !this.filename) {
      throw new Error('No folder name given and no query parsed');
    }
    const folder = foldername || this.filename.substring(0, this.filename.length - 4);

    // Read the URL list, ignoring blank lines
    const listing = await fs.promises.readFile(`./exports/${folder}.txt`, 'utf8');
    const imageUrls = listing
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line !== '')
      .slice(0, this.maxImages);
    if (imageUrls.length === 0) {
      console.warn(`No image URLs found in ./exports/${folder}.txt`);
      return;
    }

    await fs.promises.mkdir(`./exports/${folder}`, { recursive: true });

    progressBar.start(imageUrls.length, 0, { errors: this.failedUrls.length });
    try {
      for (const [index, url] of imageUrls.entries()) {
        // Anything that is not .jpg/.png is stored as .jpeg
        const known = BrowserClass.imageExtension(url);
        const extension = known === '.jpg' || known === '.png' ? known : '.jpeg';
        await this.downloadImage(url, `./exports/${folder}/${index + 1}${extension}`);
        progressBar.update(index + 1, {
          errors: this.failedUrls.length,
        });
      }
    } finally {
      progressBar.stop();
    }
  }

  /**
   * Streams one image to disk. Failures are not thrown: the URL is recorded
   * in `failedUrls` and any partially written file is removed.
   *
   * @param {string} url - Image URL to fetch.
   * @param {string} dest - Destination file path.
   * @returns {Promise<void>} Settles once the file is fully written or the
   *   failure has been recorded.
   */
  async downloadImage(url, dest) {
    let response;
    try {
      response = await axios({
        url,
        method: 'GET',
        responseType: 'stream',
      });
    } catch (e) {
      this.failedUrls.push(url);
      return;
    }

    if (response.status !== 200) {
      response.data.destroy(); // Release the unused response stream
      this.failedUrls.push(url);
      return;
    }

    try {
      await new Promise((resolve, reject) => {
        // Created only after a successful response so failed requests
        // leave no empty file behind
        const writer = fs.createWriteStream(dest);
        writer.on('finish', resolve);
        writer.on('error', reject);
        response.data.on('error', (err) => {
          writer.destroy();
          reject(err);
        });
        response.data.pipe(writer);
      });
    } catch (e) {
      this.failedUrls.push(url);
      try {
        fs.unlinkSync(dest); // Drop the partial file
      } catch (unlinkErr) {
        if (unlinkErr.code !== 'ENOENT') {
          console.warn(`Could not remove ${dest}: ${unlinkErr.message}`);
        }
      }
    }
  }
}
// Single-line console progress bar shared by the download step. The
// `{errors}` placeholder is filled from the payload passed on each update.
const progressBarOptions = {
  format: `Downloading files... |${colors.green('{bar}')}| {percentage}% || {value}/{total} Images || Failed Downloads: {errors}`,
  barCompleteChar: '\u2588',
  barIncompleteChar: '\u2591',
  hideCursor: true,
  stopOnComplete: true,
};
const progressBar = new cliProgress.SingleBar(progressBarOptions);
// Start
// Entry point: build the search URL, harvest the image URLs, then download
// the images. Any failure is reported and reflected in the exit code
// instead of surfacing as an unhandled promise rejection.
(async () => {
  const query = "Nyama choma";
  // New browser instance
  const browser = new BrowserClass();
  // Pass our input to class method (synchronous, nothing to await)
  browser.parseUserQuery(query);
  // Start harvesting image URLS
  await browser.harvest();
  await browser.downloadImages();
})().catch((err) => {
  console.error(`Scraping failed: ${err && err.message ? err.message : err}`);
  process.exitCode = 1;
});
{
"name": "puppeteer-app",
"version": "1.0.0",
"description": "Awesome JS web scraper",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"axios": "^0.19.2",
"cli-progress": "^3.8.2",
"colors": "^1.4.0",
"puppeteer": "^3.1.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment