Skip to content

Instantly share code, notes, and snippets.

@codingstark-dev
Created March 6, 2026 07:08
Show Gist options
  • Select an option

  • Save codingstark-dev/d6d767e18e469ed79bfa50f715f90e6e to your computer and use it in GitHub Desktop.

Select an option

Save codingstark-dev/d6d767e18e469ed79bfa50f715f90e6e to your computer and use it in GitHub Desktop.
SearXNG scraper
// import { convertHTMLSpecialChars } from "../extractor/html-to-content/html-utils.js";
// import { scrapeURL } from "../../index.js";
/**
* Documentation for `searchWeb` (declared further below in this file):
* searches the web via SearXNG, a metasearch across all major search engines.
* Options include 10 search categories, recency, and how many
* times to retry on other domains if the first attempt fails.
* SearXNG is a free internet metasearch engine which aggregates results from
* more than [180+ search sources](https://docs.searxng.org/user/configured_engines.html).
*
* [Searxng Overview](https://medium.com/@elmo92/search-in-peace-with-searxng-an-alternative-search-engine-that-keeps-your-searches-private-accd8cddd6fc)
* [Searxng Installation Guide](https://github.com/searxng/searxng-docker/tree/master)
* @param {string} query - The search query string.
* @param {Object} [options]
* @param {number} options.category default=0 - ["general", "news", "videos", "images",
* "science", "map", "music", "it", "files", "social+media"]
* @param {number} options.recency default=0 - ["", "day", "week", "month", "year"]
* @param {string|boolean} options.privateSearxng default=null - Use your custom domain SearXNG
* @param {number} options.maxRetries default=3 - Maximum number of retry attempts if the initial search fails.
* @param {number} options.page default=1 - The page number to retrieve.
* @param {string} options.proxy default=null - CORS proxy URL prefix (e.g. corsproxy.io) prepended to the request URL, for use from frontend JS
* @returns {Promise<Array<{title: string, url: string, snippet: string, engines: string[]}>>} An array of search result objects.
* @example const advancedResults = await searchWeb('Node.js', {
* category: 2,
* recency: 1,
* maxRetries: 5
* });
* @category Search
* @author [ai-research-agent (2024)](https://airesearch.js.org)
* [Heiser, M., Tauber, A., Flament, A., et al. (2014-)](https://github.com/searxng/searxng/graphs/contributors)
*/
// Character -> entity pairs shared by the escape and unescape directions.
const HTML_ENTITY_PAIRS: ReadonlyArray<readonly [string, string]> = [
  ["&", "&amp;"],
  ["<", "&lt;"],
  [">", "&gt;"],
  ['"', "&quot;"],
  [" ", "&nbsp;"],
  ["'", "&#39;"],
  ["`", "&#96;"],
  ["¢", "&cent;"],
  ["£", "&pound;"],
  ["¥", "&yen;"],
  ["€", "&euro;"],
  ["©", "&copy;"],
  ["®", "&reg;"],
  ["™", "&trade;"],
];

// Entity -> character lookup used when unescaping. It is built from the named
// pairs directly, so `&copy;`, `&reg;`, `&cent;`, `&pound;` and `&yen;` stay
// decodable even though escaping emits numeric references for those characters.
const HTML_DECODE_TABLE: ReadonlyMap<string, string> = new Map<string, string>([
  ...HTML_ENTITY_PAIRS.map(([char, entity]): [string, string] => [entity, char]),
  // Alternative representations that are only ever decoded, never produced.
  ["&apos;", "'"],
  ["&laquo;", "«"],
  ["&raquo;", "»"],
]);

// Character -> entity lookup used when escaping. Latin-1 Supplement characters
// (U+00A0..U+00FF) are always written as decimal numeric references, which
// intentionally overrides the named form for ¢ £ ¥ © ® (unchanged output).
const HTML_ESCAPE_TABLE: ReadonlyMap<string, string> = (() => {
  const table = new Map<string, string>(HTML_ENTITY_PAIRS);
  for (let code = 160; code <= 255; code++) {
    table.set(String.fromCharCode(code), `&#${code};`);
  }
  return table;
})();

// Matches hex references (group 1), decimal references (group 2) and named entities.
const HTML_ENTITY_REGEX = /&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|[a-zA-Z][a-zA-Z0-9]*);/g;

// Matches every character that has an entry in HTML_ESCAPE_TABLE.
const HTML_ESCAPABLE_CHAR_REGEX = /[&<>"'`\u0020\u00a0-\u00ff\u20ac\u2122]/g;

/**
 * Converts between plain text and HTML character entities.
 *
 * Unescaping (`unescape = true`, the default):
 * - decodes the named entities known to this function (`&amp;`, `&lt;`, `&gt;`,
 *   `&quot;`, `&nbsp;`, `&cent;`, `&pound;`, `&yen;`, `&euro;`, `&copy;`,
 *   `&reg;`, `&trade;`, `&apos;`, `&laquo;`, `&raquo;`);
 * - decodes any decimal (`&#169;`) or hexadecimal (`&#xA9;`) numeric reference,
 *   including code points above U+FFFF;
 * - leaves unknown named entities and out-of-range numeric references untouched;
 * - finally strips combining diacritical marks (U+0300..U+036F).
 * Note that `&nbsp;` decodes to a regular space, while `&#160;` decodes to U+00A0.
 *
 * Escaping (`unescape = false`): replaces `& < > " ' \``, the regular space
 * (as `&nbsp;`), `€`, `™` and every character in U+00A0..U+00FF (as a decimal
 * numeric reference) with its entity.
 *
 * @param str - Text to convert.
 * @param unescape - `true` to decode entities into characters, `false` to encode.
 * @returns The converted string.
 */
export function convertHTMLSpecialChars(str: string, unescape = true): string {
  if (!unescape) {
    return str.replace(HTML_ESCAPABLE_CHAR_REGEX, (char: string) => {
      const entity = HTML_ESCAPE_TABLE.get(char);
      return entity === undefined ? char : entity;
    });
  }

  const decoded = str.replace(
    HTML_ENTITY_REGEX,
    (entity: string, hex: string | undefined, dec: string | undefined) => {
      if (hex !== undefined || dec !== undefined) {
        const codePoint =
          hex !== undefined ? parseInt(hex, 16) : parseInt(dec as string, 10);
        // String.fromCodePoint throws a RangeError outside the Unicode range,
        // so invalid references are passed through unchanged instead.
        if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
          return entity;
        }
        // fromCodePoint (not fromCharCode) so astral code points such as
        // emoji are not truncated to 16 bits.
        return String.fromCodePoint(codePoint);
      }
      const char = HTML_DECODE_TABLE.get(entity);
      return char === undefined ? entity : char;
    }
  );

  // Drop combining diacritical marks left in the decoded text.
  return decoded.replace(/[\u0300-\u036f]/g, "");
}
/**
 * Options accepted by `searchWeb`. Every field is optional; the defaults noted
 * below are the ones `searchWeb` applies when a field is omitted.
 */
export interface SearchOptions {
/** Index into searchWeb's category list ("general", "news", "videos", ...). Default 0. */
category?: number;
/** Index into searchWeb's recency list ("", "day", "week", "month", "year"). Default 0. */
recency?: number;
/**
 * Base URL of a private SearXNG instance. When set it is used instead of a
 * random public domain and results are requested with `format=json`. Default null.
 */
privateSearxng?: string | boolean;
/** How many more attempts are made after an error or an empty result list. Default 3. */
maxRetries?: number;
/** Result page number, sent as the `pageno` query parameter. Default 1. */
page?: number;
/** Sent as the `language` query parameter and in the `accept-language` header. Default "en-US". */
language?: string;
/** Prefix prepended to the request URL when no private instance is used. Default null. */
proxy?: string | boolean;
/** When true, the failing request URL is also logged on error. Default false. */
verbose?: boolean;
}
// Category names selectable through `options.category` (by index).
const SEARXNG_CATEGORIES = [
  "general",
  "news",
  "videos",
  "images",
  "science",
  "map",
  "music",
  "it",
  "files",
  "social+media",
];

// Time ranges selectable through `options.recency` (by index); "" means no limit.
const SEARXNG_RECENCIES = ["", "day", "week", "month", "year"];

// Public SearXNG instances; one is picked at random for every attempt.
const SEARXNG_PUBLIC_DOMAINS = [
  "baresearch.org",
  "darmarit.org",
  "etsi.me",
  "fairsuch.net",
  "nogoo.me",
  "northboot.xyz",
  "nyc1.sx.ggtyler.dev",
  "ooglester.com",
  "opnxng.com",
  "paulgo.io",
  "priv.au",
  "s.trung.fun",
  "search.blitzw.in",
  "search.charliewhiskey.net",
  "search.citw.lgbt",
  "search.darkness.services",
  "search.datura.network",
  "search.gcomm.ch",
  "search.hbubli.cc",
  "search.im-in.space",
  "search.inetol.net",
  "search.leptons.xyz",
  "search.nadeko.net",
  "search.ngn.tf",
  "search.ononoki.org",
  "search.privacyredirect.com",
  "search.sapti.me",
  "search.rowie.at",
  "search.projectsegfau.lt",
  "search.tommy-tran.com",
  "searx.aleteoryx.me",
  "searx.ankha.ac",
  "searx.be",
  "searx.colbster937.dev",
  "searx.daetalytica.io",
  "searx.dresden.network",
  "searx.foss.family",
  "searx.hu",
  "searx.juancord.xyz",
  "searx.lunar.icu",
  "searx.mxchange.org",
  "searx.namejeff.xyz",
  "searx.oakleycord.dev",
  "searx.ro",
  "searx.sev.monster",
  "searx.thefloatinglab.world",
  "searx.tiekoetter.com",
  "searx.tuxcloud.net",
  "searx.work",
  "searx.zhenyapav.com",
  "searxng.hweeren.com",
  "searxng.online",
  "searxng.shreven.org",
  "searxng.site",
  "skyrimhater.com",
  "sx.ca.zorby.top",
  "sx.catgirl.cloud",
  "sx.thatxtreme.dev",
  "xo.wtf",
  "search.rhscz.eu",
  "search.bus-hit.me",
  "searx.rhscz.eu",
  "search.indst.eu",
  "searxng.ch",
  "www.gruble.de",
  "kantan.cat",
  "search.canine.tools",
  "search.mdosch.de",
  "search.getcobalt.org",
  "searx.electroncash.de",
  "searx.perennialte.ch",
  "searxng.brihx.fr",
  "search.fredix.xyz",
  "seek.fyi",
  "search.nordh.tech",
  "darmarit.org/searx",
  "searx.ox2.fr",
  "s.mble.dk",
  "search.einfachzocken.eu",
  "searx.mv-software.de",
  "searx.nobulart.com",
];

// Extracts title, URL, snippet and reporting engines from a SearXNG HTML results page.
function parseSearxngResultsHtml(
  html: string
): Array<{ title: string; url: string; snippet: string; engines: string[] }> {
  const articleRegex = /<article class="result[^>]*>[\s\S]*?<\/article>/g;
  const titleUrlRegex = /<h3><a href="([^"]*)"[^>]*>(.*?)<\/a><\/h3>/;
  const snippetRegex = /<p class="content">\s*(.*?)\s*<\/p>/;
  const enginesRegex = /<span>(bing|duckduckgo|yahoo|google)<\/span>/g;
  const tagRegex = /<\/?[^>]+(>|$)/g;

  const parsed: Array<{ title: string; url: string; snippet: string; engines: string[] }> = [];
  for (const article of html.match(articleRegex) || []) {
    const titleUrlMatch = titleUrlRegex.exec(article);
    // Articles without both a link target and a title are skipped.
    if (!titleUrlMatch || !titleUrlMatch[1] || !titleUrlMatch[2]) continue;

    const snippetMatch = snippetRegex.exec(article);
    const rawSnippet = snippetMatch && snippetMatch[1] ? snippetMatch[1] : "";

    const engines: string[] = [];
    let engineMatch: RegExpExecArray | null;
    // exec() with the `g` flag walks all matches and resets lastIndex once it
    // returns null, so the regex can be reused for the next article.
    while ((engineMatch = enginesRegex.exec(article)) !== null) {
      if (engineMatch[1]) engines.push(engineMatch[1]);
    }

    parsed.push({
      title: convertHTMLSpecialChars(titleUrlMatch[2].replace(tagRegex, "")),
      url: convertHTMLSpecialChars(titleUrlMatch[1]),
      snippet: convertHTMLSpecialChars(rawSnippet.replace(tagRegex, "")),
      engines,
    });
  }
  return parsed;
}

/**
 * Searches the web through a SearXNG instance.
 *
 * Without `options.privateSearxng`, a random public instance is queried and its
 * HTML results page is scraped. With `options.privateSearxng` (a base URL), that
 * instance is queried with `format=json` and the JSON results are mapped instead.
 * On a thrown error, or when a public instance yields no results, the search is
 * repeated (picking a new random public instance) until `maxRetries` is used up.
 *
 * @param query - The search query; it is URL-encoded before being sent.
 * @param options - See `SearchOptions`. Out-of-range `category` / `recency`
 *   indexes fall back to "general" / no time limit. Non-string `proxy` and
 *   `privateSearxng` values are ignored because they cannot form a URL.
 * @returns For public instances, an array of `{ title, url, snippet, engines }`.
 *   For a private instance, an array of `{ title, url, snippet, score }`, or
 *   `{ error: 1 }` when the response body is not a JSON object. An empty array
 *   when every attempt failed.
 * @example
 * const results = await searchWeb("Node.js", { category: 2, recency: 1, maxRetries: 5 });
 */
export async function searchWeb(
  query: string | number | boolean,
  options: SearchOptions = {}
  // The result shape differs per code path (see @returns); it stays `any` so
  // callers written against the previously untyped result keep compiling.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
): Promise<any> {
  const {
    category = 0,
    recency = 0,
    privateSearxng = null,
    maxRetries = 3,
    page = 1,
    language = "en-US",
    proxy = null,
    verbose = false,
  } = options;

  // Only a non-empty string can serve as a base URL; `true` would otherwise be
  // concatenated into the request URL as the text "true".
  const privateDomain =
    typeof privateSearxng === "string" && privateSearxng !== "" ? privateSearxng : null;
  const searchDomain =
    privateDomain !== null
      ? privateDomain
      : "https://" +
        SEARXNG_PUBLIC_DOMAINS[Math.floor(Math.random() * SEARXNG_PUBLIC_DOMAINS.length)];

  // Fall back to the defaults instead of emitting "undefined" into the URL.
  const categoryName = SEARXNG_CATEGORIES[category] || "general";
  const timeRangeName = SEARXNG_RECENCIES[recency] || "";

  let url =
    `${searchDomain}/search?q=${encodeURIComponent(query)}` +
    `&category_${categoryName}=1&language=${encodeURIComponent(language)}&time_range=` +
    `${timeRangeName}&safesearch=0&pageno=${page}`;
  if (privateDomain !== null) {
    url += "&format=json";
  } else if (typeof proxy === "string" && proxy !== "") {
    // On Cloudflare, change SSL mode to Full to avoid "Too many redirects".
    url = proxy + url;
  }

  try {
    const response = await fetch(url, {
      headers: {
        "accept-language": language + ",en;q=0.9",
      },
    });
    const body = await response.text();

    if (privateDomain !== null) {
      if (!body.startsWith("{")) return { error: 1 };
      const payload = JSON.parse(body) as { results?: unknown };
      if (!Array.isArray(payload.results)) return [];
      return payload.results.map(
        (entry: { title: string; url: unknown; content: string; score: unknown }) => ({
          title: entry.title,
          url:
            typeof entry.url === "string" ? entry.url.replace(/&amp;/g, "&") : entry.url,
          snippet: entry.content,
          score: entry.score,
        })
      );
    }

    const results = parseSearxngResultsHtml(body);
    if (results.length === 0 && maxRetries > 0) {
      // Nothing scraped from this instance: try again on another random one.
      return await searchWeb(query, { ...options, maxRetries: maxRetries - 1 });
    }
    return results;
  } catch (error) {
    // Anything can be thrown, so do not assume an Error instance.
    const message = error instanceof Error ? error.message : String(error);
    if (message === "ERR_TLS_CERT_ALTNAME_INVALID") {
      console.error(`TLS certificate error: ${message}`);
    } else if (message.includes("Unable to connect")) {
      console.error(`Network error: ${message}`);
    } else {
      console.error(`Error fetching search results: ${message}`);
    }
    if (verbose) {
      console.error(`Failed URL: ${url}`);
    }
    if (maxRetries > 0) {
      return await searchWeb(query, { ...options, maxRetries: maxRetries - 1 });
    }
    return [];
  }
}
// searchWeb("tata motors new cars", {
// category: 0,
// recency: 0,
// maxRetries: 50,
// }).then(console.log);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment