Skip to content

Instantly share code, notes, and snippets.

@everylittlefox
Last active December 3, 2022 11:29
Show Gist options
  • Select an option

  • Save everylittlefox/eeb9c2a4f487ce94b528caa6a191b186 to your computer and use it in GitHub Desktop.

Select an option

Save everylittlefox/eeb9c2a4f487ce94b528caa6a191b186 to your computer and use it in GitHub Desktop.
An Express server that crawls Library Genesis (https://libgen.rocks) and exposes a '/search' endpoint for searching books, comics, articles and other resources.
import { load } from 'cheerio'
import got from 'got'
import url from 'url'
import express from 'express'
import http from 'http'
import 'express-async-errors'
// The Express application; middleware and the '/search' route are registered on it below.
const app = express()
/**
 * Creates (does not throw) an Error tagged as a validation failure.
 *
 * @param {string} message - Human-readable description of what was invalid.
 * @returns {Error} An Error whose `name` is 'ValidationError'.
 */
const validationError = (message) =>
  Object.assign(new Error(message), { name: 'ValidationError' })
// Maps the category names accepted by this API to the single-letter codes
// that are joined and sent as the `topics` parameter of the search request.
const validCategories = {
  'non-fiction': 'l',
  fiction: 'f',
  'scientific-articles': 'a'
}

// Splits a comma-separated list and trims every entry, so that
// "fiction, non-fiction" is treated the same as "fiction,non-fiction".
const splitCategories = (categories) =>
  categories.split(',').map((name) => name.trim())

/**
 * Despite its name, this returns the entries of `categories` that are NOT
 * supported; an empty array means every entry is valid.
 *
 * @param {string} categories - Comma-separated category names.
 * @returns {string[]} The (trimmed) names that are not keys of validCategories.
 */
const isValidCategories = (categories) => {
  const supported = Object.keys(validCategories)
  return splitCategories(categories).filter(
    (name) => !supported.includes(name)
  )
}

/**
 * Converts comma-separated category names into the matching comma-separated
 * topic codes. Callers are expected to validate with isValidCategories first;
 * an unknown name contributes an empty entry to the result.
 *
 * @param {string} categories - Comma-separated category names.
 * @returns {string} Comma-separated topic codes, in the same order.
 */
const categoriesToTopics = (categories) =>
  splitCategories(categories)
    .map((name) => validCategories[name])
    .join(',')
// Request logger: prints the HTTP method and path, then the parsed query
// string as a table, before handing the request to the next handler.
app.use(function logRequest(request, _response, proceed) {
  const { method, path, query } = request
  console.log(method, path)
  console.table(query)
  proceed()
})
/**
 * GET /search — validates the query string, runs the search and responds with
 * the result as JSON.
 *
 * Query parameters:
 *   query      (required) the search text
 *   page       (optional) integer >= 1
 *   categories (optional) comma-separated category names
 *   perPage    (optional) 25, 50 or 100
 *
 * Validation failures are thrown as ValidationError; the error middleware
 * registered further down turns those into a 400 response.
 */
app.get('/search', async (req, res) => {
  const { query, page, categories, perPage } = req.query

  if (!query) {
    throw validationError('query parameter is required')
  }
  // Repeated or bracketed parameters (?query=a&query=b, ?query[x]=1) are parsed
  // into arrays/objects; reject them here instead of failing later with a 500.
  if (typeof query !== 'string') {
    throw validationError('query parameter must be a single string')
  }

  // Number.isInteger also rejects NaN, Infinity and fractional pages like 1.5.
  if (page && !(Number.isInteger(+page) && +page >= 1)) {
    throw validationError('page parameter can only take numbers >= 1')
  }

  if (categories) {
    // A non-string value has no split() and would otherwise raise a TypeError.
    if (typeof categories !== 'string') {
      throw validationError(
        'categories parameter must be a comma-separated string'
      )
    }
    // isValidCategories returns the unsupported names (empty when all are valid).
    const unsupported = isValidCategories(categories)
    if (unsupported.length) {
      throw validationError(`unsupported categories: ${unsupported.join(', ')}`)
    }
  }

  if (perPage && ![25, 50, 100].includes(+perPage)) {
    throw validationError(`perPage can only be: 25, 50 or 100`)
  }

  // Omitted optional values are passed as undefined so getVolumes applies
  // its own defaults.
  const searchResults = await getVolumes(
    query,
    categories ? categoriesToTopics(categories) : undefined,
    page ? +page : undefined,
    perPage ? +perPage : undefined
  )
  return res.json(searchResults)
})
// Fallback for requests that no route handled: reply with an empty 404.
// The response is finished here, so next() must NOT be called — passing an
// already-ended response further down the chain makes later handlers try to
// respond a second time.
app.use((req, res) => {
  res.status(404).end()
})
// Error middleware (Express recognises it by its four parameters).
// Logs every error; validation failures are reported to the client as a 400
// with the message, anything else becomes an empty 500.
app.use(function handleError(err, _req, res, _next) {
  console.log(err)
  const isValidationFailure = err.name === 'ValidationError'
  if (!isValidationFailure) {
    return res.status(500).end()
  }
  return res.status(400).json({ error: err.message })
})
// Wrap the Express app in an HTTP server and start listening.
// The port can be overridden through the PORT environment variable; when PORT
// is unset, non-numeric or 0 the original default of 3004 is used.
const server = http.createServer(app)
const PORT = Number.parseInt(process.env.PORT, 10) || 3004
server.listen(PORT, () => console.log(`serving on ${PORT}...`))
/**
 * Runs a search against https://libgen.rocks/index.php and scrapes the rows of
 * the `table#tablelibgen` result table from the returned HTML.
 *
 * @param {string} query - Search text, sent as the `req` parameter.
 * @param {string} [topics] - Comma-separated topic codes; omitted when falsy.
 * @param {number} [page=1] - Result page, sent as the `page` parameter.
 * @param {number} [perPage=100] - Page size, sent as the `res` parameter.
 * @returns {Promise<object>} When the page contains a paginator:
 *   `{ data, page, totalPages, next, prev }`; otherwise `{ resources }`.
 *   Both `data` and `resources` hold the array of scraped rows.
 */
async function getVolumes(query, topics, page = 1, perPage = 100) {
  const reqUrl = new url.URL('https://libgen.rocks/index.php')
  // searchParams percent-encodes values on its own. Pre-encoding the query
  // with encodeURI would encode it twice (a space would be sent as "%2520").
  reqUrl.searchParams.append('req', query)
  reqUrl.searchParams.append('res', perPage)
  if (topics) reqUrl.searchParams.append('topics', topics)
  if (page) reqUrl.searchParams.append('page', page)
  console.log('url', reqUrl.toString())

  const res = await got.get(reqUrl)
  const $ = load(res.body)

  const results = []
  $('table#tablelibgen > tbody > tr').each((_, row) => {
    const volume = parseVolumeRow($, row)
    if (volume) results.push(volume)
  })

  const paginator = matchPaginator(res.body)
  if (paginator) {
    console.log(paginator)
    return {
      data: results,
      page,
      totalPages: paginator.maxPage,
      next: page + 1 <= paginator.maxPage,
      prev: page - 1 > 0
    }
  }
  // NOTE(review): this branch uses the key `resources` while the paginated
  // branch uses `data`; kept as-is so existing clients keep working.
  return { resources: results }
}

// Converts one <tr> of the result table into a plain object, or returns null
// for rows whose first cell has no direct <a> child.
function parseVolumeRow($, row) {
  const fields = $(row).children('td')
  const titleTd = $(fields.get(0))
  const titleAnchor = titleTd.children('a').first()
  if (!titleAnchor.length) return null

  // attr() yields undefined when the anchor has no title attribute; fall back
  // to '' so a single odd row cannot throw and fail the whole request.
  const idMatch = (titleAnchor.attr('title') || '').match(/ID\: ([0-9]+)/)

  // The anchor text may span several lines; collapse it into a single line.
  const title = titleAnchor
    .text()
    .split('\n')
    .map((part) => part.trim())
    .join(' ')
    .trim()

  // The type is the title of the first <a> in the first <span> of the first
  // <nobr> of the title cell; it is undefined when that chain is missing.
  const type = titleTd
    .children('nobr')
    .first()
    .children('span')
    .first()
    .children('a')
    .first()
    .attr('title')

  return {
    id: idMatch ? +idMatch[1] : undefined,
    title,
    type,
    authors: $(fields.get(1))
      .text()
      .replace('[...]', '')
      .split(';')
      .map((a) => a.trim()),
    publisher: $(fields.get(2)).text(),
    year: $(fields.get(3)).text(),
    language: $(fields.get(4)).text(),
    pages: $(fields.get(5)).text(),
    size: $(fields.get(6)).text(),
    extension: $(fields.get(7)).text(),
    // The key is spelled "mirros" in the existing response format; it is kept
    // unchanged so current consumers do not break.
    mirros: fields
      .last()
      .children('a')
      .toArray()
      .map((a) => ({
        link: a.attribs['href'],
        title: $(a).attr('title')
      }))
  }
}
/**
 * Extracts the three numeric arguments of the inline
 * `new Paginator("paginator_example_top", ...)` call from the page HTML.
 *
 * @param {string} html - Raw HTML of a result page.
 * @returns {{perPage: number, maxPage: number, currentPage: number} | null}
 *   The first number is reported as maxPage, the second as perPage and the
 *   third as currentPage; null when the call is not present.
 */
const matchPaginator = (html) => {
  const paginatorCall =
    /new Paginator\("paginator_example_top", ([0-9]+), ([0-9]+), ([0-9]+)/
  const found = html.match(paginatorCall)
  if (!found) {
    return null
  }
  const [, maxPage, perPage, currentPage] = found
  return {
    perPage: Number(perPage),
    maxPage: Number(maxPage),
    currentPage: Number(currentPage)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment