Created
March 6, 2026 07:08
-
-
Save codingstark-dev/d6d767e18e469ed79bfa50f715f90e6e to your computer and use it in GitHub Desktop.
SearXNG scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // import { convertHTMLSpecialChars } from "../extractor/html-to-content/html-utils.js"; | |
| // import { scrapeURL } from "../../index.js"; | |
| /** | |
| * Search Web via SearXNG metasearch of all major search engines. | |
| * Options select one of 10 search categories, a recency window, and how many | |
| * times to retry on another instance if the first attempt fails. | |
| * SearXNG is a free internet metasearch engine which aggregates results from | |
| * more than [180+ search sources](https://docs.searxng.org/user/configured_engines.html). | |
| * | |
| * [Searxng Overview](https://medium.com/@elmo92/search-in-peace-with-searxng-an-alternative-search-engine-that-keeps-your-searches-private-accd8cddd6fc) | |
| * [Searxng Installation Guide](https://github.com/searxng/searxng-docker/tree/master) | |
| * @param {string} query - The search query string. | |
| * @param {Object} [options] | |
| * @param {number} options.category default=0 - ["general", "news", "videos", "images", | |
| * "science", "map", "music", "it", "files", "social+media"] | |
| * @param {number} options.recency default=0 - ["", "day", "week", "month", "year"] | |
| * @param {string|boolean} options.privateSearxng default=null - Use your custom domain SearXNG | |
| * @param {number} options.maxRetries default=3 - Maximum number of retry attempts if the initial search fails. | |
| * @param {number} options.page default=1 - The page number to retrieve. | |
| * @param {string|boolean} options.proxy default=null - URL prefix of a CORS proxy (e.g. corsproxy.io) prepended to the request URL, for use in frontend JS | |
| * @returns {Promise<Array<{title: string, url: string, snippet: string, engines: string[]}>>} An array of search result objects. | |
| * @example const advancedResults = await searchWeb('Node.js', { | |
| * category: 2, | |
| * recency: 1, | |
| * maxRetries: 5 | |
| * }); | |
| * @category Search | |
| * @author [ai-research-agent (2024)](https://airesearch.js.org) | |
| * [Heiser, M., Tauber, A., Flament, A., et al. (2014-)](https://github.com/searxng/searxng/graphs/contributors) | |
| */ | |
| export function convertHTMLSpecialChars(str: string, unescape = true) { | |
| const entityMap: { [key: string]: string } = { | |
| "&": "&", | |
| "<": "<", | |
| ">": ">", | |
| '"': """, | |
| " ": " ", | |
| "'": "'", | |
| "`": "`", | |
| "¢": "¢", | |
| "£": "£", | |
| "¥": "¥", | |
| "€": "€", | |
| "©": "©", | |
| "®": "®", | |
| "™": "™", | |
| }; | |
| // Add numeric character references for Latin-1 Supplement characters | |
| for (let i = 160; i <= 255; i++) { | |
| entityMap[String.fromCharCode(i)] = `&#${i};`; | |
| } | |
| if (unescape) { | |
| // Create a reverse mapping for unescaping | |
| const reverseEntityMap = Object.fromEntries( | |
| Object.entries(entityMap).map(([k, v]) => [v, k]) | |
| ); | |
| // Add alternative representations | |
| reverseEntityMap["'"] = "'"; | |
| reverseEntityMap["«"] = "«"; | |
| reverseEntityMap["»"] = "»"; | |
| // Regex to match all types of HTML entities | |
| const entityRegex = new RegExp( | |
| Object.keys(reverseEntityMap).join("|") + "|&#[0-9]+;|&#x[0-9a-fA-F]+;", | |
| "g" | |
| ); | |
| str = str.replace(entityRegex, (entity: string) => { | |
| if (entity.startsWith("&#x")) { | |
| // Convert hexadecimal numeric character reference | |
| return String.fromCharCode(parseInt(entity.slice(3, -1), 16)); | |
| } else if (entity.startsWith("&#")) { | |
| // Convert decimal numeric character reference | |
| return String.fromCharCode(parseInt(entity.slice(2, -1), 10)); | |
| } | |
| // Convert named entity | |
| return reverseEntityMap[entity] || entity; | |
| }); | |
| str = str.replace(/[\u0300-\u036f]/g, ""); //special chars | |
| return str; | |
| } else { | |
| // Regex to match all characters that need to be escaped | |
| const charRegex = new RegExp(`[${Object.keys(entityMap).join("")}]`, "g"); | |
| return str.replace(charRegex, (char: string) => entityMap[char as keyof typeof entityMap]); | |
| } | |
| } | |
| export interface SearchOptions { | |
| category?: number; | |
| recency?: number; | |
| privateSearxng?: string | boolean; | |
| maxRetries?: number; | |
| page?: number; | |
| language?: string; | |
| proxy?: string | boolean; | |
| verbose?: boolean; | |
| } | |
| export async function searchWeb( | |
| query: string | number | boolean, | |
| options: SearchOptions = {} | |
| ) { | |
| const { | |
| category = 0, | |
| recency = 0, | |
| privateSearxng = null, | |
| maxRetries = 3, | |
| page = 1, | |
| language = "en-US", | |
| proxy = null, | |
| verbose = false, | |
| } = options; | |
| const CATEGORY_LIST = [ | |
| "general", | |
| "news", | |
| "videos", | |
| "images", | |
| "science", | |
| "map", | |
| "music", | |
| "it", | |
| "files", | |
| "social+media", | |
| ]; | |
| const RECENCY_LIST = ["", "day", "week", "month", "year"]; | |
| const SEARX_DOMAINS = [ | |
| "baresearch.org", | |
| "darmarit.org", | |
| "etsi.me", | |
| "fairsuch.net", | |
| "nogoo.me", | |
| "northboot.xyz", | |
| "nyc1.sx.ggtyler.dev", | |
| "ooglester.com", | |
| "opnxng.com", | |
| "paulgo.io", | |
| "priv.au", | |
| "s.trung.fun", | |
| "search.blitzw.in", | |
| "search.charliewhiskey.net", | |
| "search.citw.lgbt", | |
| "search.darkness.services", | |
| "search.datura.network", | |
| "search.gcomm.ch", | |
| "search.hbubli.cc", | |
| "search.im-in.space", | |
| "search.inetol.net", | |
| "search.leptons.xyz", | |
| "search.nadeko.net", | |
| "search.ngn.tf", | |
| "search.ononoki.org", | |
| "search.privacyredirect.com", | |
| "search.sapti.me", | |
| "search.rowie.at", | |
| "search.projectsegfau.lt", | |
| "search.tommy-tran.com", | |
| "searx.aleteoryx.me", | |
| "searx.ankha.ac", | |
| "searx.be", | |
| "searx.colbster937.dev", | |
| "searx.daetalytica.io", | |
| "searx.dresden.network", | |
| "searx.foss.family", | |
| "searx.hu", | |
| "searx.juancord.xyz", | |
| "searx.lunar.icu", | |
| "searx.mxchange.org", | |
| "searx.namejeff.xyz", | |
| "searx.oakleycord.dev", | |
| "searx.ro", | |
| "searx.sev.monster", | |
| "searx.thefloatinglab.world", | |
| "searx.tiekoetter.com", | |
| "searx.tuxcloud.net", | |
| "searx.work", | |
| "searx.zhenyapav.com", | |
| "searxng.hweeren.com", | |
| "searxng.online", | |
| "searxng.shreven.org", | |
| "searxng.site", | |
| "skyrimhater.com", | |
| "sx.ca.zorby.top", | |
| "sx.catgirl.cloud", | |
| "sx.thatxtreme.dev", | |
| "xo.wtf", | |
| "search.rhscz.eu", | |
| "search.bus-hit.me", | |
| "searx.rhscz.eu", | |
| "search.indst.eu", | |
| "searxng.ch", | |
| "www.gruble.de", | |
| "kantan.cat", | |
| "search.canine.tools", | |
| "search.mdosch.de", | |
| "search.getcobalt.org", | |
| "searx.electroncash.de", | |
| "searx.perennialte.ch", | |
| "searxng.brihx.fr", | |
| "search.fredix.xyz", | |
| "seek.fyi", | |
| "search.nordh.tech", | |
| "darmarit.org/searx", | |
| "searx.ox2.fr", | |
| "s.mble.dk", | |
| "search.einfachzocken.eu", | |
| "searx.mv-software.de", | |
| "searx.nobulart.com", | |
| ]; | |
| //select a random domain if none is provided | |
| const searchDomain = | |
| privateSearxng || | |
| "https://" + | |
| SEARX_DOMAINS[Math.floor(Math.random() * SEARX_DOMAINS.length)]; | |
| const categoryName = CATEGORY_LIST[category]; // Using the first category as default | |
| const timeRangeName = RECENCY_LIST[recency]; // Using the first time range as default | |
| var url = | |
| `${searchDomain}/search?q=${encodeURIComponent(query)}` + | |
| `&category_${categoryName}=1&language=${language}&time_range=` + | |
| `${timeRangeName}&safesearch=0&pageno=${page}`; | |
| if (privateSearxng) url += "&format=json"; | |
| //on cloudflare to avoid "Too many redirects" change SSL mode to Full | |
| if (proxy && !privateSearxng) url = proxy + url; | |
| // console.log(url); | |
| try { | |
| const resultHTML = await ( | |
| await fetch(url, { | |
| headers: { | |
| "accept-language": language + ",en;q=0.9", | |
| }, | |
| }) | |
| ).text(); | |
| if (privateSearxng) { | |
| if (!resultHTML.startsWith("{")) return { error: 1 }; | |
| //todo use public | |
| var { results, suggestions, infoboxes } = JSON.parse(resultHTML); | |
| results.forEach((result: { url: string }) => { | |
| result.url = result.url.replace(/&/g, "&"); | |
| }); | |
| results = results.map( | |
| ({ | |
| title, | |
| url, | |
| content, | |
| score, | |
| }: { | |
| title: string; | |
| url: string; | |
| content: string; | |
| score: any; | |
| }) => { | |
| return { title, url, snippet: content, score }; | |
| } | |
| ); | |
| // console.log(resultHTML); | |
| return results; | |
| // return {results, suggestions}; | |
| } | |
| results = []; | |
| const resultRegex = /<article class="result[^>]*>[\s\S]*?<\/article>/g; | |
| const titleUrlRegex = /<h3><a href="([^"]*)"[^>]*>(.*?)<\/a><\/h3>/; | |
| const snippetRegex = /<p class="content">\s*(.*?)\s*<\/p>/; | |
| const enginesRegex = /<span>(bing|duckduckgo|yahoo|google)<\/span>/g; | |
| const linksRegex = | |
| /<a href="([^"]*)" class="(cache_link|proxyfied_link)"[^>]*>(cached|proxied)<\/a>/g; | |
| let match; | |
| while ((match = resultRegex.exec(resultHTML)) !== null) { | |
| const resultHtml = match[0]; | |
| const titleUrlMatch = titleUrlRegex.exec(resultHtml); | |
| const snippetMatch = snippetRegex.exec(resultHtml); | |
| if (titleUrlMatch && titleUrlMatch[1] && titleUrlMatch[2]) { | |
| const url = convertHTMLSpecialChars(titleUrlMatch[1]); | |
| let title = titleUrlMatch[2].replace(/<\/?[^>]+(>|$)/g, ""); | |
| let snippet = snippetMatch | |
| ? snippetMatch[1].replace(/<\/?[^>]+(>|$)/g, "") | |
| : ""; | |
| let engines = []; | |
| let engineMatch; | |
| while ((engineMatch = enginesRegex.exec(resultHtml)) !== null) { | |
| engines.push(engineMatch[1]); | |
| } | |
| let cached = null; | |
| let linkMatch; | |
| // while ((linkMatch = linksRegex.exec(resultHtml)) !== null) { | |
| // cached = linkMatch[1]; | |
| // } | |
| title = convertHTMLSpecialChars(title); | |
| snippet = convertHTMLSpecialChars(snippet); | |
| // if (!url.includes(".de/")) | |
| results.push({ title, url, snippet }); | |
| } | |
| } | |
| if (results.length === 0 && maxRetries > 0) { | |
| results = await searchWeb(query, { | |
| ...options, | |
| maxRetries: maxRetries - 1, | |
| // useProxy: true | |
| }); | |
| } | |
| //filter out url that end with .de | |
| // results = results.filter((result) => !result.url.includes(".de/")); | |
| return results; | |
| // } catch (error) { | |
| // console.error(`Error fetching search results: ${error.message}`); | |
| // return []; | |
| // } | |
| } catch (error) { | |
| if ((error as Error).message === "ERR_TLS_CERT_ALTNAME_INVALID") { | |
| console.error(`TLS certificate error: ${(error as Error).message}`); | |
| } else if ((error as Error).message.includes("Unable to connect")) { | |
| console.error(`Network error: ${(error as Error).message}`); | |
| } else { | |
| console.error( | |
| `Error fetching search results: ${(error as Error).message}` | |
| ); | |
| } | |
| if (verbose) { | |
| console.error(`Failed URL: ${url}`); | |
| } | |
| if (maxRetries > 0) { | |
| // console.log(`Retrying... (${maxRetries} attempts left)`); | |
| return await searchWeb(query, { ...options, maxRetries: maxRetries - 1 }); | |
| } | |
| return []; | |
| } | |
| } | |
| // searchWeb("tata motors new cars", { | |
| // category: 0, | |
| // recency: 0, | |
| // maxRetries: 50, | |
| // }).then(console.log); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment