Last active
December 3, 2022 11:29
-
-
Save everylittlefox/eeb9c2a4f487ce94b528caa6a191b186 to your computer and use it in GitHub Desktop.
An Express server that crawls Library Genesis (https://libgen.rocks) and exposes a '/search' endpoint for searching books, comics, articles and other resources.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { load } from 'cheerio' | |
| import got from 'got' | |
| import url from 'url' | |
| import express from 'express' | |
| import http from 'http' | |
| import 'express-async-errors' | |
// Express application instance; the request logger, the /search route and the fallback/error handlers below are registered on it.
| const app = express() | |
/**
 * Creates an Error whose `name` is 'ValidationError' so that callers can
 * tell request-validation failures apart from other errors by name.
 *
 * @param {string} message - Human-readable description of the problem.
 * @returns {Error} The tagged error (not thrown here; the caller throws it).
 */
const validationError = (message) =>
  Object.assign(new Error(message), { name: 'ValidationError' })
// Maps the category names accepted by this API to the single-letter codes
// that are forwarded as the `topics` search parameter.
const validCategories = {
  'non-fiction': 'l',
  fiction: 'f',
  'scientific-articles': 'a'
}

/**
 * Normalises a raw `categories` query value into a flat list of names.
 *
 * Express parses a repeated query parameter (`?categories=a&categories=b`)
 * into an array, so both a comma-separated string and an array of such
 * strings are accepted. Surrounding whitespace is stripped from each name.
 *
 * @param {string|string[]} categories - Raw query value.
 * @returns {string[]} Individual category names, in input order.
 */
const splitCategories = (categories) =>
  (Array.isArray(categories) ? categories : [categories])
    .flatMap((entry) => String(entry).split(','))
    .map((name) => name.trim())

/**
 * Despite its name, returns the list of category names that are NOT
 * supported; an empty array therefore means every category is valid.
 *
 * @param {string|string[]} categories - Comma-separated names (or an array of them).
 * @returns {string[]} The unsupported names, in input order.
 */
const isValidCategories = (categories) => {
  const supported = Object.keys(validCategories)
  return splitCategories(categories).filter((name) => !supported.includes(name))
}

/**
 * Translates category names into the comma-separated topic codes defined
 * in `validCategories`. Intended to be called only after the names have
 * been validated with `isValidCategories`.
 *
 * @param {string|string[]} categories - Comma-separated names (or an array of them).
 * @returns {string} Comma-separated topic codes, e.g. 'l,f'.
 */
const categoriesToTopics = (categories) =>
  splitCategories(categories)
    .map((name) => validCategories[name])
    .join(',')
// Request logger: prints the HTTP method and path, then the parsed query
// string as a table, before handing the request to the next handler.
app.use(function logRequest(request, _response, proceed) {
  const { method, path, query } = request
  console.log(method, path)
  console.table(query)
  proceed()
})
/**
 * GET /search
 *
 * Query parameters:
 *   - query      (required) search text
 *   - page       (optional) whole number >= 1
 *   - categories (optional) comma-separated keys of `validCategories`
 *   - perPage    (optional) one of 25, 50 or 100
 *
 * Responds with the JSON produced by `getVolumes`. Validation failures are
 * thrown as errors named 'ValidationError'; the file imports
 * 'express-async-errors', which is what lets a throw inside this async
 * handler reach the error-handling middleware.
 */
app.get('/search', async (req, res) => {
  const { query, page, categories, perPage } = req.query

  if (!query) {
    throw validationError('query parameter is required')
  }

  // Number.isInteger rejects NaN as well as Infinity and fractional values
  // such as "1.5", which would otherwise be forwarded upstream as a page.
  if (page && !(Number.isInteger(+page) && +page >= 1)) {
    throw validationError('page parameter can only take numbers >= 1')
  }

  if (categories) {
    // isValidCategories returns the names that are NOT supported.
    const unsupported = isValidCategories(categories)
    if (unsupported.length) {
      throw validationError(`unsupported categories: ${unsupported.join(', ')}`)
    }
  }

  if (perPage && ![25, 50, 100].includes(+perPage)) {
    throw validationError(`perPage can only be: 25, 50 or 100`)
  }

  // Omitted parameters are passed as undefined so getVolumes applies its
  // own defaults.
  const searchResults = await getVolumes(
    query,
    categories ? categoriesToTopics(categories) : undefined,
    page ? +page : undefined,
    perPage ? +perPage : undefined
  )
  return res.json(searchResults)
})
// Fallback for requests that no earlier route handled: reply 404 with an
// empty body. The response is finished here, so the middleware chain is
// deliberately not continued with next() — nothing downstream may write to
// an already-ended response.
app.use((req, res) => {
  res.status(404).end()
})
// Error-handling middleware (Express identifies it by its four parameters,
// so `next` must stay in the signature).
//   - errors named 'ValidationError' -> 400 with `{ error: <message> }`
//   - anything else                  -> 500 with an empty body
app.use((error, req, res, next) => {
  console.error(error)

  // Once headers are on the wire the status can no longer be changed;
  // hand over to Express's default handler, which closes the connection.
  if (res.headersSent) {
    return next(error)
  }

  if (error.name === 'ValidationError') {
    return res.status(400).json({ error: error.message })
  }
  return res.status(500).end()
})
// Port the HTTP server listens on.
const PORT = 3004

// Serve the Express app through a plain Node HTTP server and log a message
// once it is accepting connections.
const server = http.createServer(app)
server.listen(PORT, () => {
  console.log(`serving on ${PORT}...`)
})
/**
 * Runs a search against https://libgen.rocks/index.php and scrapes the
 * rows of the `table#tablelibgen` result table.
 *
 * @param {string} query - Search text, sent as the `req` parameter.
 * @param {string} [topics] - Comma-separated topic codes, sent as `topics`.
 * @param {number} [page=1] - Page number, sent as `page`.
 * @param {number} [perPage=100] - Results per page, sent as `res`.
 * @returns {Promise<object>} When the page contains a paginator:
 *   `{ data, page, totalPages, next, prev }`; otherwise `{ resources }`.
 *   (The two shapes differ in the original API and are kept as-is.)
 */
async function getVolumes(query, topics, page = 1, perPage = 100) {
  const reqUrl = new url.URL('https://libgen.rocks/index.php')
  // URLSearchParams percent-encodes values on its own. Pre-encoding the
  // query with encodeURI would double-encode it (a space would be sent as
  // "%2520"), so the raw text is appended.
  reqUrl.searchParams.append('req', query)
  reqUrl.searchParams.append('res', perPage)
  if (topics) reqUrl.searchParams.append('topics', topics)
  if (page) reqUrl.searchParams.append('page', page)
  console.log('url', reqUrl.toString())

  const res = await got.get(reqUrl)
  const $ = load(res.body)

  const results = []
  $('table#tablelibgen > tbody > tr').each(function () {
    const volume = parseVolumeRow($, this)
    if (volume) results.push(volume)
  })

  const paginator = matchPaginator(res.body)
  if (paginator) {
    console.log(paginator)
    return {
      data: results,
      page,
      totalPages: paginator.maxPage,
      next: page + 1 <= paginator.maxPage,
      prev: page - 1 > 0
    }
  }
  return { resources: results }
}

/**
 * Converts one <tr> of the result table into a plain record.
 *
 * @param {Function} $ - The cheerio instance loaded with the result page.
 * @param {object} row - The <tr> element to parse.
 * @returns {object|null} The parsed record, or null when the first cell
 *   holds no anchor (such rows are skipped).
 */
function parseVolumeRow($, row) {
  const fields = $(row).children('td')
  const titleTd = $(fields.get(0))
  const titleAnchor = titleTd.children('a').first()
  if (!titleAnchor.length) return null

  // The numeric id is read from the anchor's title attribute ("ID: 123");
  // a missing attribute simply yields an undefined id instead of throwing.
  const idMatch = (titleAnchor.attr('title') ?? '').match(/ID: ([0-9]+)/)

  // Collapse the multi-line anchor text into a single line.
  const title = titleAnchor
    .text()
    .split('\n')
    .map((part) => part.trim())
    .join(' ')
    .trim()

  // The type is the title attribute of the anchor at td > nobr > span > a;
  // attr() returns undefined when any element of that chain is absent.
  const type = titleTd
    .children('nobr')
    .first()
    .children('span')
    .first()
    .children('a')
    .first()
    .attr('title')

  const cellText = (index) => $(fields.get(index)).text()

  return {
    id: idMatch ? +idMatch[1] : undefined,
    title,
    type,
    authors: cellText(1)
      .replace('[...]', '')
      .split(';')
      .map((a) => a.trim()),
    publisher: cellText(2),
    year: cellText(3),
    language: cellText(4),
    pages: cellText(5),
    size: cellText(6),
    extension: cellText(7),
    // NOTE: "mirros" (sic) is the key clients already receive; it is kept
    // unchanged so the response shape stays backward-compatible.
    mirros: fields
      .last()
      .children('a')
      .toArray()
      .map((a) => ({
        link: a.attribs.href,
        title: $(a).attr('title')
      }))
  }
}
/**
 * Looks for the `new Paginator("paginator_example_top", a, b, c` call in
 * the page source and extracts its three numeric arguments.
 *
 * @param {string} html - Raw HTML of the result page.
 * @returns {{perPage: number, maxPage: number, currentPage: number}|null}
 *   The first argument as `maxPage`, the second as `perPage` and the third
 *   as `currentPage`; null when the call is not found.
 */
const matchPaginator = (html) => {
  const paginatorCall =
    /new Paginator\("paginator_example_top", ([0-9]+), ([0-9]+), ([0-9]+)/
  const found = html.match(paginatorCall)
  if (!found) {
    return null
  }

  const [, maxPage, perPage, currentPage] = found
  return {
    perPage: Number(perPage),
    maxPage: Number(maxPage),
    currentPage: Number(currentPage)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment