Web scraping - Create datasets with Javascript
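This gist scrapes Google Images to build an image dataset: index.js drives a Puppeteer browser, intercepts the page's network requests to harvest image URLs, exports them to a text file under ./exports/, and then downloads the first 50 images with axios while a cli-progress bar tracks progress and failed downloads. Install the dependencies (npm install puppeteer colors axios cli-progress) and run node index.js.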
index.js
// Run `npm install puppeteer colors axios cli-progress`
// Run `node index.js`
const puppeteer = require('puppeteer');
const fs = require('fs');
const axios = require('axios');
const colors = require('colors');
const cliProgress = require('cli-progress');
class BrowserClass {
    constructor() {
        // Viewport dimensions
        this.width = 1200;
        this.height = 800;
        // URLs list
        this.urls = [];
        this.url = null;
        this.filename = null;
        this.failedUrls = [];
    }
    parseUserQuery(query) {
        if (!query || !query.trim().length) {
            throw new Error("Pass a non-empty query to parseUserQuery");
        }
        this.filename = query; // Base name for the exported URL file and image folder
        // Build the Google Images search URL (spaces become '+')
        this.url = `https://www.google.com/search?q=${query.trim().split(/\s+/).join("+")}&rlz=1C1CHBD_enKE893KE893&sxsrf=ALeKk022coDKoBy8l-iiOi9X_fb805nnFw:1589622014644&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiV4om8i7jpAhXQzoUKHfuVAwoQ_AUoAXoECBoQAw`;
    }
    async harvest() {
        return new Promise(async (resolve) => {
            // Puppeteer instance
            const browser = await puppeteer.launch({
                headless: false,
                devtools: false,
                args: [`--window-size=${this.width},${this.height}`]
            });
            // Open a page and set the viewport
            const page = await browser.newPage();
            await page.setViewport({
                width: this.width,
                height: this.height,
                deviceScaleFactor: 1,
            });
            // Start intercepting requests. The handler must be attached before
            // navigation; otherwise intercepted requests are never continued
            // and the page never finishes loading.
            await page.setRequestInterception(true);
            // Request counter
            let counter = 0;
            // Request handler
            page.on('request', async (request) => {
                counter++;
                console.log("Request " + counter + ": URL: " + request.url());
                // Keep only image URLs (.jpg, .jpeg or .png)
                if (/\.(jpe?g|png)$/.test(request.url())) {
                    this.urls.push(request.url()); // Add the URL to our URLs list
                    if (this.urls.length === 50) {
                        await this.downloadUrls(); // Export all collected links
                        await browser.close(); // Terminate the browser session once we have enough images
                        resolve();
                        return;
                    }
                }
                request.continue();
            });
            await page.goto(this.url, {
                waitUntil: "networkidle2"
            });
            console.info('Listening for http requests\n\n');
            try {
                // Keep scrolling until the bottom of the results page is reached
                // (or until enough requests have been observed)
                while (await page.evaluate(() => document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) && counter <= 40) {
                    await page.evaluate((y) => {
                        document.scrollingElement.scrollBy(0, y); // Scroll down the page
                    }, 100);
                    await page.waitFor(100); // Wait for the image skeletons to load
                }
                // Click the thumbnails so the full-resolution image requests fire
                await page.evaluate(() => {
                    let elements = document.querySelectorAll('img.rg_i.Q4LuWd.tx8vtf');
                    console.log("Elements: " + elements.length); // Log the number of image elements found
                    elements.forEach((element) => element.click());
                });
            } catch (e) {
                // The browser may already have closed once enough URLs were collected
            }
        });
    }
    async downloadUrls() {
        // Create the exports directory if it doesn't exist
        if (!fs.existsSync('./exports')) {
            fs.mkdirSync('./exports');
        }
        // Write the URLs to a text file, one per line
        await fs.promises.appendFile(`./exports/${this.filename}.txt`, this.urls.join("\n"));
    }
    async downloadImages(foldername) {
        // Default to the name derived from the search query
        foldername = foldername || this.filename;
        // Read the exported URLs into an array
        const data = await fs.promises.readFile(`./exports/${foldername}.txt`);
        const imageArray = data.toString().split("\n");
        // Create the image folder if it doesn't exist
        if (!fs.existsSync(`./exports/${foldername}`)) {
            fs.mkdirSync(`./exports/${foldername}`);
        }
        progressBar.start(50, 0, { errors: 0 }); // Start progress bar
        // Loop through the URLs and download each file
        for (let index = 0; index < 50 && index < imageArray.length; index++) {
            const url = imageArray[index];
            // Choose a file extension that matches the URL
            if (url.substr(-4) === '.jpg') {
                await this.downloadImage(url, `./exports/${foldername}/${index + 1}.jpg`);
            } else if (url.substr(-4) === '.png') {
                await this.downloadImage(url, `./exports/${foldername}/${index + 1}.png`);
            } else {
                await this.downloadImage(url, `./exports/${foldername}/${index + 1}.jpeg`);
            }
            progressBar.update(index + 1, {
                errors: this.failedUrls.length
            });
        }
    }
    async downloadImage(url, dest) {
        const writer = fs.createWriteStream(dest);
        try {
            const { data, status } = await axios({
                url,
                method: 'GET',
                responseType: 'stream'
            });
            if (status !== 200) {
                this.failedUrls.push(url);
                writer.destroy();
                return;
            }
            data.pipe(writer);
            // Wait until the file has been fully written; clean up on stream errors
            await new Promise((resolve) => {
                writer.on('finish', resolve);
                writer.on('error', () => {
                    writer.destroy();
                    resolve();
                });
            });
        } catch (e) {
            // Record the failure and remove the partial file
            this.failedUrls.push(url);
            writer.destroy();
            if (fs.existsSync(dest)) fs.unlinkSync(dest);
        }
    }
}
const progressBar = new cliProgress.SingleBar({
    format: 'Downloading files... |' + colors.green('{bar}') + `| {percentage}% || {value}/{total} Images || Failed Downloads: {errors}`,
    barCompleteChar: '\u2588',
    barIncompleteChar: '\u2591',
    hideCursor: true,
    stopOnComplete: true
});
// Start
(async () => {
    const query = "Nyama choma";
    // New browser instance
    const browser = new BrowserClass();
    // Pass our input to the class method
    browser.parseUserQuery(query);
    // Harvest image URLs, then download the images
    await browser.harvest();
    await browser.downloadImages();
})();
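A minimal reuse sketch (my own example, not part of the original gist): because downloadImages takes an optional folder name and reads the URL list from disk, a previously exported list can be re-downloaded without opening a new browser session, assuming harvest() was already run once for the same query.

// Re-download images from an existing ./exports/Nyama choma.txt
(async () => {
    const browser = new BrowserClass();
    browser.parseUserQuery("Nyama choma"); // Sets the default file/folder name
    await browser.downloadImages(); // Reads ./exports/Nyama choma.txt and saves the images
})();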
package.json
{
    "name": "puppeteer-app",
    "version": "1.0.0",
    "description": "Awesome JS web scraper",
    "main": "index.js",
    "scripts": {
        "test": "echo \"Error: no test specified\" && exit 1"
    },
    "author": "",
    "license": "ISC",
    "dependencies": {
        "axios": "^0.19.2",
        "cli-progress": "^3.8.2",
        "colors": "^1.4.0",
        "puppeteer": "^3.1.0"
    }
}