Web scraping - Create datasets with Javascript
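This gist scrapes Google Images to build an image dataset: index.js drives a Puppeteer browser, intercepts the page's network requests to harvest image URLs, exports them to a text file under ./exports/, and then downloads the first 50 images with axios while a cli-progress bar tracks progress and failed downloads. Install the dependencies (npm install puppeteer colors axios cli-progress) and run node index.js.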
index.js
// Run `npm install puppeteer colors axios cli-progress`
// Run `node index.js`
const puppeteer = require('puppeteer');
const fs = require('fs');
const axios = require('axios');
const colors = require('colors');
const cliProgress = require('cli-progress');
class BrowserClass {
    constructor() {
        // Viewport dimensions
        this.width = 1200;
        this.height = 800;
        // URLs list
        this.urls = [];
        this.url = null;
        this.filename = null;
        this.failedUrls = [];
    }
    parseUserQuery(query) {
        if (!query || !query.trim().length) {
            throw new Error("Pass a non-empty query to parseUserQuery");
        }
        this.filename = query; // Base name for the exported URL file and image folder
        // Build the Google Images search URL (spaces become '+')
        this.url = `https://www.google.com/search?q=${query.trim().split(/\s+/).join("+")}&rlz=1C1CHBD_enKE893KE893&sxsrf=ALeKk022coDKoBy8l-iiOi9X_fb805nnFw:1589622014644&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiV4om8i7jpAhXQzoUKHfuVAwoQ_AUoAXoECBoQAw`;
    }
    async harvest() {
        return new Promise(async (resolve) => {
            // Puppeteer instance
            const browser = await puppeteer.launch({
                headless: false,
                devtools: false,
                args: [`--window-size=${this.width},${this.height}`]
            });
            // Open a page and set the viewport
            const page = await browser.newPage();
            await page.setViewport({
                width: this.width,
                height: this.height,
                deviceScaleFactor: 1,
            });
            // Start intercepting requests. The handler must be attached before
            // navigation; otherwise intercepted requests are never continued
            // and the page never finishes loading.
            await page.setRequestInterception(true);
            // Request counter
            let counter = 0;
            // Request handler
            page.on('request', async (request) => {
                counter++;
                console.log("Request " + counter + ": URL: " + request.url());
                // Keep only image URLs (.jpg, .jpeg or .png)
                if (/\.(jpe?g|png)$/.test(request.url())) {
                    this.urls.push(request.url()); // Add the URL to our URLs list
                    if (this.urls.length === 50) {
                        await this.downloadUrls(); // Export all collected links
                        await browser.close(); // Terminate the browser session once we have enough images
                        resolve();
                        return;
                    }
                }
                request.continue();
            });
            await page.goto(this.url, {
                waitUntil: "networkidle2"
            });
            console.info('Listening for http requests\n\n');
            try {
                // Keep scrolling until the bottom of the results page is reached
                // (or until enough requests have been observed)
                while (await page.evaluate(() => document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) && counter <= 40) {
                    await page.evaluate((y) => {
                        document.scrollingElement.scrollBy(0, y); // Scroll down the page
                    }, 100);
                    await page.waitFor(100); // Wait for the image skeletons to load
                }
                // Click the thumbnails so the full-resolution image requests fire
                await page.evaluate(() => {
                    let elements = document.querySelectorAll('img.rg_i.Q4LuWd.tx8vtf');
                    console.log("Elements: " + elements.length); // Log the number of image elements found
                    elements.forEach((element) => element.click());
                });
            } catch (e) {
                // The browser may already have closed once enough URLs were collected
            }
        });
    }
    async downloadUrls() {
        // Create the exports directory if it doesn't exist
        if (!fs.existsSync('./exports')) {
            fs.mkdirSync('./exports');
        }
        // Write the URLs to a text file, one per line
        await fs.promises.appendFile(`./exports/${this.filename}.txt`, this.urls.join("\n"));
    }
    async downloadImages(foldername) {
        // Default to the name derived from the search query
        foldername = foldername || this.filename;
        // Read the exported URLs into an array
        const data = await fs.promises.readFile(`./exports/${foldername}.txt`);
        const imageArray = data.toString().split("\n");
        // Create the image folder if it doesn't exist
        if (!fs.existsSync(`./exports/${foldername}`)) {
            fs.mkdirSync(`./exports/${foldername}`);
        }
        progressBar.start(50, 0, { errors: 0 }); // Start progress bar
        // Loop through the URLs and download each file
        for (let index = 0; index < 50 && index < imageArray.length; index++) {
            const url = imageArray[index];
            // Choose a file extension that matches the URL
            if (url.substr(-4) === '.jpg') {
                await this.downloadImage(url, `./exports/${foldername}/${index + 1}.jpg`);
            } else if (url.substr(-4) === '.png') {
                await this.downloadImage(url, `./exports/${foldername}/${index + 1}.png`);
            } else {
                await this.downloadImage(url, `./exports/${foldername}/${index + 1}.jpeg`);
            }
            progressBar.update(index + 1, {
                errors: this.failedUrls.length
            });
        }
    }
    async downloadImage(url, dest) {
        const writer = fs.createWriteStream(dest);
        try {
            const { data, status } = await axios({
                url,
                method: 'GET',
                responseType: 'stream'
            });
            if (status !== 200) {
                this.failedUrls.push(url);
                writer.destroy();
                return;
            }
            data.pipe(writer);
            // Wait until the file has been fully written; clean up on stream errors
            await new Promise((resolve) => {
                writer.on('finish', resolve);
                writer.on('error', () => {
                    writer.destroy();
                    resolve();
                });
            });
        } catch (e) {
            // Record the failure and remove the partial file
            this.failedUrls.push(url);
            writer.destroy();
            if (fs.existsSync(dest)) fs.unlinkSync(dest);
        }
    }
}
const progressBar = new cliProgress.SingleBar({
    format: 'Downloading files... |' + colors.green('{bar}') + `| {percentage}% || {value}/{total} Images || Failed Downloads: {errors}`,
    barCompleteChar: '\u2588',
    barIncompleteChar: '\u2591',
    hideCursor: true,
    stopOnComplete: true
});
// Start
(async () => {
    const query = "Nyama choma";
    // New browser instance
    const browser = new BrowserClass();
    // Pass our input to the class method
    browser.parseUserQuery(query);
    // Harvest image URLs, then download the images
    await browser.harvest();
    await browser.downloadImages();
})();
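A minimal reuse sketch (my own example, not part of the original gist): because downloadImages takes an optional folder name and reads the URL list from disk, a previously exported list can be re-downloaded without opening a new browser session, assuming harvest() was already run once for the same query.

// Re-download images from an existing ./exports/Nyama choma.txt
(async () => {
    const browser = new BrowserClass();
    browser.parseUserQuery("Nyama choma"); // Sets the default file/folder name
    await browser.downloadImages(); // Reads ./exports/Nyama choma.txt and saves the images
})();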
package.json
{
    "name": "puppeteer-app",
    "version": "1.0.0",
    "description": "Awesome JS web scraper",
    "main": "index.js",
    "scripts": {
        "test": "echo \"Error: no test specified\" && exit 1"
    },
    "author": "",
    "license": "ISC",
    "dependencies": {
        "axios": "^0.19.2",
        "cli-progress": "^3.8.2",
        "colors": "^1.4.0",
        "puppeteer": "^3.1.0"
    }
}