codingstark-dev/SearXNG.ts

## SearXNG.ts
// import { convertHTMLSpecialChars } from "../extractor/html-to-content/html-utils.js";
// import { scrapeURL } from "../../index.js";
/**
 * Search Web via SearXNG metasearch of all major search engines.
 * Options are 10 search categories, recency, and how many
 * times to retry other domains if first time fails.
 * SearXNG is a free internet metasearch engine which aggregates results from
 *  more than [180+ search sources](https://docs.searxng.org/user/configured_engines.html).
 *
 * [Searxng Overview](https://medium.com/@elmo92/search-in-peace-with-searxng-an-alternative-search-engine-that-keeps-your-searches-private-accd8cddd6fc)
 * [Searxng Installation Guide](https://github.com/searxng/searxng-docker/tree/master)
 * @param {string} query - The search query string.
 * @param {Object} [options]
 * @param {number} options.category default=0 - ["general", "news", "videos", "images",
 *  "science", "map", "music", "it", "files", "social+media"]
 * @param {number} options.recency default=0 - ["", "day", "week", "month", "year"]
 * @param {string|boolean} options.privateSearxng default=null - Use your custom domain SearXNG
 * @param {number} options.maxRetries default=3 - Maximum number of retry attempts if the initial search fails.
 * @param {number} options.page default=1 - The page number to retrieve.
 * @param {string|boolean} options.proxy default=null - URL prefix of a CORS proxy (e.g. corsproxy.io) prepended to the request URL for use in frontend JS
 * @returns {Promise<Array<{title: string, url: string, snippet: string, engines: string[]}>>} An array of search result objects.
 * @example  const advancedResults = await searchWeb('Node.js', {
 *   category: 2,
 *   recency: 1,
 *   maxRetries: 5
 * });
 * @category Search
 * @author [ai-research-agent (2024)](https://airesearch.js.org)
 * [Heiser, M., Tauber, A., Flament, A., et al. (2014-)](https://github.com/searxng/searxng/graphs/contributors)
 */
/**
 * Escapes or unescapes HTML special characters in a string.
 *
 * Unescape mode (the default) decodes the named entities listed in the map
 * below (plus `&apos;`, `&laquo;`, `&raquo;`) and every decimal (`&#169;`) or
 * hexadecimal (`&#xA9;`) numeric character reference, then removes combining
 * diacritical marks (U+0300–U+036F) from the result.
 *
 * Escape mode replaces each mapped character with its entity; Latin-1
 * Supplement characters (U+00A0–U+00FF) are written as decimal references.
 *
 * @param str - Text to convert.
 * @param unescape - `true` (default) decodes entities, `false` encodes them.
 * @returns The converted text.
 */
export function convertHTMLSpecialChars(str: string, unescape = true): string {
  const namedEntities: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    " ": "&nbsp;",
    "'": "&#39;",
    "`": "&#96;",
    "¢": "&cent;",
    "£": "&pound;",
    "¥": "&yen;",
    "€": "&euro;",
    "©": "&copy;",
    "®": "&reg;",
    "™": "&trade;",
  };

  if (!unescape) {
    // Latin-1 Supplement characters are emitted as numeric references. This
    // deliberately overrides the named forms of ¢ £ ¥ © ® for *escaping* only.
    const escapeMap: Record<string, string> = { ...namedEntities };
    for (let code = 160; code <= 255; code++) {
      escapeMap[String.fromCharCode(code)] = `&#${code};`;
    }

    // None of the mapped characters is special inside a character class.
    const charRegex = new RegExp(`[${Object.keys(escapeMap).join("")}]`, "g");
    return str.replace(charRegex, (char: string) => escapeMap[char] ?? char);
  }

  // Build the decode table from the *named* map only, so that &cent;, &pound;,
  // &yen;, &copy; and &reg; are not lost to the numeric overrides used when
  // escaping. Numeric references are decoded generically below.
  const decodeMap: Record<string, string> = {};
  for (const [char, entity] of Object.entries(namedEntities)) {
    decodeMap[entity] = char;
  }

  // Alternative representations
  decodeMap["&apos;"] = "'";
  decodeMap["&laquo;"] = "«";
  decodeMap["&raquo;"] = "»";

  const entityRegex = new RegExp(
    Object.keys(decodeMap).join("|") + "|&#[0-9]+;|&#[xX][0-9a-fA-F]+;",
    "g"
  );

  const decoded = str.replace(entityRegex, (entity: string) => {
    if (entity.startsWith("&#")) {
      const isHex = entity[2] === "x" || entity[2] === "X";
      const codePoint = parseInt(
        entity.slice(isHex ? 3 : 2, -1),
        isHex ? 16 : 10
      );
      // fromCodePoint handles astral characters (emoji etc.) that
      // fromCharCode would truncate to 16 bits; references beyond the Unicode
      // range are left untouched instead of throwing.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return decodeMap[entity] ?? entity;
  });

  // Drop combining diacritical marks that are present as separate code points.
  return decoded.replace(/[\u0300-\u036f]/g, "");
}

/**
 * Optional settings accepted by `searchWeb`. Every field may be omitted.
 */
export interface SearchOptions {
  /**
   * Index into the category list of `searchWeb`:
   * 0 general, 1 news, 2 videos, 3 images, 4 science, 5 map, 6 music, 7 it,
   * 8 files, 9 social media. Defaults to 0.
   */
  category?: number;
  /**
   * Index into the recency list of `searchWeb`:
   * 0 any time, 1 day, 2 week, 3 month, 4 year. Defaults to 0.
   */
  recency?: number;
  /** Result page number to request. Defaults to 1. */
  page?: number;
  /** Language sent as the `language` query parameter and in `accept-language`. Defaults to "en-US". */
  language?: string;
  /** How many times the search is retried after a failed or empty attempt. Defaults to 3. */
  maxRetries?: number;
  /** Base URL of a self-hosted SearXNG instance, used instead of a random public one; JSON output is requested from it. */
  privateSearxng?: string | boolean;
  /** Prefix prepended to the request URL (e.g. a CORS proxy) when no private instance is used. */
  proxy?: string | boolean;
  /** When true, the URL of a failed request is logged as well. */
  verbose?: boolean;
}

/** One entry parsed from a SearXNG HTML results page. */
type SearxngHtmlResult = {
  title: string;
  url: string;
  snippet: string;
  engines: string[];
};

/**
 * Parses the result entries out of a SearXNG HTML results page.
 *
 * Each `<article class="result…">` element yields one entry; articles without
 * a linked `<h3>` title are skipped. Markup inside the title and snippet is
 * stripped and HTML entities are decoded with `convertHTMLSpecialChars`.
 *
 * @param html - Raw HTML of the results page.
 * @returns The parsed entries in page order (empty when nothing matched).
 */
function parseSearxngResultsHtml(html: string): SearxngHtmlResult[] {
  const resultRegex = /<article class="result[^>]*>[\s\S]*?<\/article>/g;
  const titleUrlRegex = /<h3><a href="([^"]*)"[^>]*>(.*?)<\/a><\/h3>/;
  const snippetRegex = /<p class="content">\s*(.*?)\s*<\/p>/;
  const enginesRegex = /<span>(bing|duckduckgo|yahoo|google)<\/span>/g;
  const stripTags = (fragment: string) =>
    fragment.replace(/<\/?[^>]+(>|$)/g, "");

  const parsed: SearxngHtmlResult[] = [];
  let articleMatch: RegExpExecArray | null;
  while ((articleMatch = resultRegex.exec(html)) !== null) {
    const articleHtml = articleMatch[0];
    const titleUrlMatch = titleUrlRegex.exec(articleHtml);
    if (!titleUrlMatch || !titleUrlMatch[1] || !titleUrlMatch[2]) continue;

    const snippetMatch = snippetRegex.exec(articleHtml);

    // exec() on a global regex resets lastIndex to 0 after returning null, so
    // the same regex object can be reused for the next article.
    const engines: string[] = [];
    let engineMatch: RegExpExecArray | null;
    while ((engineMatch = enginesRegex.exec(articleHtml)) !== null) {
      engines.push(engineMatch[1]);
    }

    parsed.push({
      title: convertHTMLSpecialChars(stripTags(titleUrlMatch[2])),
      url: convertHTMLSpecialChars(titleUrlMatch[1]),
      snippet: convertHTMLSpecialChars(
        snippetMatch ? stripTags(snippetMatch[1]) : ""
      ),
      engines,
    });
  }
  return parsed;
}

/**
 * Searches the web through a SearXNG instance.
 *
 * Without `privateSearxng` a random public instance is queried and its HTML
 * results page is parsed; an empty page or a failed request is retried (each
 * retry picks a new random instance) until `maxRetries` is used up. With
 * `privateSearxng` set to a URL, that instance is asked for `format=json`.
 *
 * @param query - The search query.
 * @param options - See {@link SearchOptions}. An out-of-range `category` or
 *   `recency` falls back to the first entry. Only string values of
 *   `privateSearxng` and `proxy` are used; a bare `true` carries no URL and is
 *   ignored.
 * @returns For public instances an array of `{ title, url, snippet, engines }`;
 *   for a private instance an array of `{ title, url, snippet, score }`, or
 *   `{ error: 1 }` when its response is not JSON; `[]` when the request keeps
 *   failing and no retries are left.
 * @example
 * const advancedResults = await searchWeb("Node.js", {
 *   category: 2,
 *   recency: 1,
 *   maxRetries: 5,
 * });
 */
export async function searchWeb(
  query: string | number | boolean,
  options: SearchOptions = {}
  // The result shape differs per mode and callers historically received an
  // untyped value, so the return type stays loose for compatibility.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
): Promise<any> {
  const {
    category = 0,
    recency = 0,
    privateSearxng = null,
    maxRetries = 3,
    page = 1,
    language = "en-US",
    proxy = null,
    verbose = false,
  } = options;

  const CATEGORY_LIST = [
    "general",
    "news",
    "videos",
    "images",
    "science",
    "map",
    "music",
    "it",
    "files",
    "social+media",
  ];
  const RECENCY_LIST = ["", "day", "week", "month", "year"];

  const SEARX_DOMAINS = [
    "baresearch.org",
    "darmarit.org",
    "etsi.me",
    "fairsuch.net",
    "nogoo.me",
    "northboot.xyz",
    "nyc1.sx.ggtyler.dev",
    "ooglester.com",
    "opnxng.com",
    "paulgo.io",
    "priv.au",
    "s.trung.fun",
    "search.blitzw.in",
    "search.charliewhiskey.net",
    "search.citw.lgbt",
    "search.darkness.services",
    "search.datura.network",
    "search.gcomm.ch",
    "search.hbubli.cc",
    "search.im-in.space",
    "search.inetol.net",
    "search.leptons.xyz",
    "search.nadeko.net",
    "search.ngn.tf",
    "search.ononoki.org",
    "search.privacyredirect.com",
    "search.sapti.me",
    "search.rowie.at",
    "search.projectsegfau.lt",
    "search.tommy-tran.com",
    "searx.aleteoryx.me",
    "searx.ankha.ac",
    "searx.be",
    "searx.colbster937.dev",
    "searx.daetalytica.io",
    "searx.dresden.network",
    "searx.foss.family",
    "searx.hu",
    "searx.juancord.xyz",
    "searx.lunar.icu",
    "searx.mxchange.org",
    "searx.namejeff.xyz",
    "searx.oakleycord.dev",
    "searx.ro",
    "searx.sev.monster",
    "searx.thefloatinglab.world",
    "searx.tiekoetter.com",
    "searx.tuxcloud.net",
    "searx.work",
    "searx.zhenyapav.com",
    "searxng.hweeren.com",
    "searxng.online",
    "searxng.shreven.org",
    "searxng.site",
    "skyrimhater.com",
    "sx.ca.zorby.top",
    "sx.catgirl.cloud",
    "sx.thatxtreme.dev",
    "xo.wtf",
    "search.rhscz.eu",
    "search.bus-hit.me",
    "searx.rhscz.eu",
    "search.indst.eu",
    "searxng.ch",
    "www.gruble.de",
    "kantan.cat",
    "search.canine.tools",
    "search.mdosch.de",
    "search.getcobalt.org",
    "searx.electroncash.de",
    "searx.perennialte.ch",
    "searxng.brihx.fr",
    "search.fredix.xyz",
    "seek.fyi",
    "search.nordh.tech",
    "darmarit.org/searx",
    "searx.ox2.fr",
    "s.mble.dk",
    "search.einfachzocken.eu",
    "searx.mv-software.de",
    "searx.nobulart.com",
  ];

  // Only a non-empty string can name an instance; `true` would otherwise be
  // concatenated into the URL as the literal text "true".
  const privateDomain =
    typeof privateSearxng === "string" && privateSearxng !== ""
      ? privateSearxng
      : null;

  // Select a random public domain if no private one is provided.
  const searchDomain =
    privateDomain ??
    "https://" +
      SEARX_DOMAINS[Math.floor(Math.random() * SEARX_DOMAINS.length)];

  // Fall back to the first entry instead of sending "undefined" to the server.
  const categoryName = CATEGORY_LIST[category] ?? CATEGORY_LIST[0];
  const timeRangeName = RECENCY_LIST[recency] ?? RECENCY_LIST[0];

  let url =
    `${searchDomain}/search?q=${encodeURIComponent(query)}` +
    `&category_${categoryName}=1&language=${encodeURIComponent(language)}` +
    `&time_range=${timeRangeName}&safesearch=0&pageno=${page}`;

  if (privateDomain) url += "&format=json";

  // On Cloudflare, to avoid "Too many redirects" change SSL mode to Full.

  if (typeof proxy === "string" && proxy !== "" && !privateDomain) {
    url = proxy + url;
  }

  try {
    const response = await fetch(url, {
      headers: {
        "accept-language": language + ",en;q=0.9",
      },
    });
    const body = await response.text();

    if (privateDomain) {
      if (!body.trimStart().startsWith("{")) return { error: 1 };

      const payload = JSON.parse(body) as { results?: unknown };
      const rawResults: Array<{
        title: string;
        url: string;
        content: string;
        score: unknown;
      }> = Array.isArray(payload.results) ? payload.results : [];

      return rawResults.map(({ title, url: resultUrl, content, score }) => ({
        title,
        url:
          typeof resultUrl === "string"
            ? resultUrl.replace(/&amp;/g, "&")
            : resultUrl,
        snippet: content,
        score,
      }));
    }

    const results = parseSearxngResultsHtml(body);

    // An empty page usually means the chosen instance blocked or rate-limited
    // the request, so try again (a new random instance is picked).
    if (results.length === 0 && maxRetries > 0) {
      return await searchWeb(query, {
        ...options,
        maxRetries: maxRetries - 1,
      });
    }

    return results;
  } catch (error: unknown) {
    // Anything can be thrown; narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);

    if (message === "ERR_TLS_CERT_ALTNAME_INVALID") {
      console.error(`TLS certificate error: ${message}`);
    } else if (message.includes("Unable to connect")) {
      console.error(`Network error: ${message}`);
    } else {
      console.error(`Error fetching search results: ${message}`);
    }
    if (verbose) {
      console.error(`Failed URL: ${url}`);
    }
    if (maxRetries > 0) {
      return await searchWeb(query, { ...options, maxRetries: maxRetries - 1 });
    }
    return [];
  }
}


// searchWeb("tata motors new cars", {
//   category: 0,
//   recency: 0,
//   maxRetries: 50,
// }).then(console.log);
	// import { convertHTMLSpecialChars } from "../extractor/html-to-content/html-utils.js";
	// import { scrapeURL } from "../../index.js";
	/**
	* Search Web via SearXNG metasearch of all major search engines.
	* Options are 10 search categories, recency, and how many
	* times to retry other domains if first time fails.
	* SearXNG is a free internet metasearch engine which aggregates results from
	* more than [180+ search sources](https://docs.searxng.org/user/configured_engines.html).
	*
	* [Searxng Overview](https://medium.com/@elmo92/search-in-peace-with-searxng-an-alternative-search-engine-that-keeps-your-searches-private-accd8cddd6fc)
	* [Searxng Installation Guide](https://github.com/searxng/searxng-docker/tree/master)
	* @param {string} query - The search query string.
	* @param {Object} [options]
	* @param {number} options.category default=0 - ["general", "news", "videos", "images",
	* "science", "map", "music", "it", "files", "social+media"]
	* @param {number} options.recency default=0 - ["", "day", "week", "month", "year"]
	* @param {string|boolean} options.privateSearxng default=null - Use your custom domain SearXNG
	* @param {number} options.maxRetries default=3 - Maximum number of retry attempts if the initial search fails.
	* @param {number} options.page default=1 - The page number to retrieve.
	* @param {string} options.proxy default=false - Use corsproxy.io to access in frontend JS
	* @returns {Promise<Array<{title: string, url: string, snippet: string, engines: string[]}>>} An array of search result objects.
	* @example const advancedResults = await searchWeb('Node.js', {
	* category: 2,
	* recency: 1,
	* maxRetries: 5
	* });
	* @category Search
	* @author [ai-research-agent (2024)](https://airesearch.js.org)
	* [Heiser, M., Tauber, A., Flament, A., et al. (2014-)](https://github.com/searxng/searxng/graphs/contributors)
	*/
	/**
	 * Escapes or unescapes HTML special characters in a string.
	 *
	 * Unescape mode (the default) decodes the named entities listed in the map
	 * below (plus `&apos;`, `&laquo;`, `&raquo;`) and every decimal (`&#169;`) or
	 * hexadecimal (`&#xA9;`) numeric character reference, then removes combining
	 * diacritical marks (U+0300–U+036F) from the result.
	 *
	 * Escape mode replaces each mapped character with its entity; Latin-1
	 * Supplement characters (U+00A0–U+00FF) are written as decimal references.
	 *
	 * @param str - Text to convert.
	 * @param unescape - `true` (default) decodes entities, `false` encodes them.
	 * @returns The converted text.
	 */
	export function convertHTMLSpecialChars(str: string, unescape = true): string {
	  const namedEntities: Record<string, string> = {
	    "&": "&amp;",
	    "<": "&lt;",
	    ">": "&gt;",
	    '"': "&quot;",
	    " ": "&nbsp;",
	    "'": "&#39;",
	    "`": "&#96;",
	    "¢": "&cent;",
	    "£": "&pound;",
	    "¥": "&yen;",
	    "€": "&euro;",
	    "©": "&copy;",
	    "®": "&reg;",
	    "™": "&trade;",
	  };

	  if (!unescape) {
	    // Latin-1 Supplement characters are emitted as numeric references. This
	    // deliberately overrides the named forms of ¢ £ ¥ © ® for *escaping* only.
	    const escapeMap: Record<string, string> = { ...namedEntities };
	    for (let code = 160; code <= 255; code++) {
	      escapeMap[String.fromCharCode(code)] = `&#${code};`;
	    }

	    // None of the mapped characters is special inside a character class.
	    const charRegex = new RegExp(`[${Object.keys(escapeMap).join("")}]`, "g");
	    return str.replace(charRegex, (char: string) => escapeMap[char] ?? char);
	  }

	  // Build the decode table from the *named* map only, so that &cent;, &pound;,
	  // &yen;, &copy; and &reg; are not lost to the numeric overrides used when
	  // escaping. Numeric references are decoded generically below.
	  const decodeMap: Record<string, string> = {};
	  for (const [char, entity] of Object.entries(namedEntities)) {
	    decodeMap[entity] = char;
	  }

	  // Alternative representations
	  decodeMap["&apos;"] = "'";
	  decodeMap["&laquo;"] = "«";
	  decodeMap["&raquo;"] = "»";

	  const entityRegex = new RegExp(
	    Object.keys(decodeMap).join("|") + "|&#[0-9]+;|&#[xX][0-9a-fA-F]+;",
	    "g"
	  );

	  const decoded = str.replace(entityRegex, (entity: string) => {
	    if (entity.startsWith("&#")) {
	      const isHex = entity[2] === "x" || entity[2] === "X";
	      const codePoint = parseInt(
	        entity.slice(isHex ? 3 : 2, -1),
	        isHex ? 16 : 10
	      );
	      // fromCodePoint handles astral characters (emoji etc.) that
	      // fromCharCode would truncate to 16 bits; references beyond the Unicode
	      // range are left untouched instead of throwing.
	      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
	    }
	    return decodeMap[entity] ?? entity;
	  });

	  // Drop combining diacritical marks that are present as separate code points.
	  return decoded.replace(/[\u0300-\u036f]/g, "");
	}

	/**
	 * Optional settings accepted by `searchWeb`. Every field may be omitted.
	 */
	export interface SearchOptions {
	  /**
	   * Index into the category list of `searchWeb`:
	   * 0 general, 1 news, 2 videos, 3 images, 4 science, 5 map, 6 music, 7 it,
	   * 8 files, 9 social media. Defaults to 0.
	   */
	  category?: number;
	  /**
	   * Index into the recency list of `searchWeb`:
	   * 0 any time, 1 day, 2 week, 3 month, 4 year. Defaults to 0.
	   */
	  recency?: number;
	  /** Base URL of a self-hosted SearXNG instance, used instead of a random public one; JSON output is requested from it. */
	  privateSearxng?: string | boolean;
	  /** How many times the search is retried after a failed or empty attempt. Defaults to 3. */
	  maxRetries?: number;
	  /** Result page number to request. Defaults to 1. */
	  page?: number;
	  /** Language sent as the `language` query parameter and in `accept-language`. Defaults to "en-US". */
	  language?: string;
	  /** Prefix prepended to the request URL (e.g. a CORS proxy) when no private instance is used. */
	  proxy?: string | boolean;
	  /** When true, the URL of a failed request is logged as well. */
	  verbose?: boolean;
	}

	/** One entry parsed from a SearXNG HTML results page. */
	type SearxngHtmlResult = {
	  title: string;
	  url: string;
	  snippet: string;
	  engines: string[];
	};

	/**
	 * Parses the result entries out of a SearXNG HTML results page.
	 *
	 * Each `<article class="result…">` element yields one entry; articles without
	 * a linked `<h3>` title are skipped. Markup inside the title and snippet is
	 * stripped and HTML entities are decoded with `convertHTMLSpecialChars`.
	 *
	 * @param html - Raw HTML of the results page.
	 * @returns The parsed entries in page order (empty when nothing matched).
	 */
	function parseSearxngResultsHtml(html: string): SearxngHtmlResult[] {
	  const resultRegex = /<article class="result[^>]*>[\s\S]*?<\/article>/g;
	  const titleUrlRegex = /<h3><a href="([^"]*)"[^>]*>(.*?)<\/a><\/h3>/;
	  const snippetRegex = /<p class="content">\s*(.*?)\s*<\/p>/;
	  const enginesRegex = /<span>(bing|duckduckgo|yahoo|google)<\/span>/g;
	  const stripTags = (fragment: string) =>
	    fragment.replace(/<\/?[^>]+(>|$)/g, "");

	  const parsed: SearxngHtmlResult[] = [];
	  let articleMatch: RegExpExecArray | null;
	  while ((articleMatch = resultRegex.exec(html)) !== null) {
	    const articleHtml = articleMatch[0];
	    const titleUrlMatch = titleUrlRegex.exec(articleHtml);
	    if (!titleUrlMatch || !titleUrlMatch[1] || !titleUrlMatch[2]) continue;

	    const snippetMatch = snippetRegex.exec(articleHtml);

	    // exec() on a global regex resets lastIndex to 0 after returning null, so
	    // the same regex object can be reused for the next article.
	    const engines: string[] = [];
	    let engineMatch: RegExpExecArray | null;
	    while ((engineMatch = enginesRegex.exec(articleHtml)) !== null) {
	      engines.push(engineMatch[1]);
	    }

	    parsed.push({
	      title: convertHTMLSpecialChars(stripTags(titleUrlMatch[2])),
	      url: convertHTMLSpecialChars(titleUrlMatch[1]),
	      snippet: convertHTMLSpecialChars(
	        snippetMatch ? stripTags(snippetMatch[1]) : ""
	      ),
	      engines,
	    });
	  }
	  return parsed;
	}

	/**
	 * Searches the web through a SearXNG instance.
	 *
	 * Without `privateSearxng` a random public instance is queried and its HTML
	 * results page is parsed; an empty page or a failed request is retried (each
	 * retry picks a new random instance) until `maxRetries` is used up. With
	 * `privateSearxng` set to a URL, that instance is asked for `format=json`.
	 *
	 * @param query - The search query.
	 * @param options - See {@link SearchOptions}. An out-of-range `category` or
	 *   `recency` falls back to the first entry. Only string values of
	 *   `privateSearxng` and `proxy` are used; a bare `true` carries no URL and is
	 *   ignored.
	 * @returns For public instances an array of `{ title, url, snippet, engines }`;
	 *   for a private instance an array of `{ title, url, snippet, score }`, or
	 *   `{ error: 1 }` when its response is not JSON; `[]` when the request keeps
	 *   failing and no retries are left.
	 * @example
	 * const advancedResults = await searchWeb("Node.js", {
	 *   category: 2,
	 *   recency: 1,
	 *   maxRetries: 5,
	 * });
	 */
	export async function searchWeb(
	  query: string | number | boolean,
	  options: SearchOptions = {}
	  // The result shape differs per mode and callers historically received an
	  // untyped value, so the return type stays loose for compatibility.
	  // eslint-disable-next-line @typescript-eslint/no-explicit-any
	): Promise<any> {
	  const {
	    category = 0,
	    recency = 0,
	    privateSearxng = null,
	    maxRetries = 3,
	    page = 1,
	    language = "en-US",
	    proxy = null,
	    verbose = false,
	  } = options;

	  const CATEGORY_LIST = [
	    "general",
	    "news",
	    "videos",
	    "images",
	    "science",
	    "map",
	    "music",
	    "it",
	    "files",
	    "social+media",
	  ];
	  const RECENCY_LIST = ["", "day", "week", "month", "year"];

	  const SEARX_DOMAINS = [
	    "baresearch.org",
	    "darmarit.org",
	    "etsi.me",
	    "fairsuch.net",
	    "nogoo.me",
	    "northboot.xyz",
	    "nyc1.sx.ggtyler.dev",
	    "ooglester.com",
	    "opnxng.com",
	    "paulgo.io",
	    "priv.au",
	    "s.trung.fun",
	    "search.blitzw.in",
	    "search.charliewhiskey.net",
	    "search.citw.lgbt",
	    "search.darkness.services",
	    "search.datura.network",
	    "search.gcomm.ch",
	    "search.hbubli.cc",
	    "search.im-in.space",
	    "search.inetol.net",
	    "search.leptons.xyz",
	    "search.nadeko.net",
	    "search.ngn.tf",
	    "search.ononoki.org",
	    "search.privacyredirect.com",
	    "search.sapti.me",
	    "search.rowie.at",
	    "search.projectsegfau.lt",
	    "search.tommy-tran.com",
	    "searx.aleteoryx.me",
	    "searx.ankha.ac",
	    "searx.be",
	    "searx.colbster937.dev",
	    "searx.daetalytica.io",
	    "searx.dresden.network",
	    "searx.foss.family",
	    "searx.hu",
	    "searx.juancord.xyz",
	    "searx.lunar.icu",
	    "searx.mxchange.org",
	    "searx.namejeff.xyz",
	    "searx.oakleycord.dev",
	    "searx.ro",
	    "searx.sev.monster",
	    "searx.thefloatinglab.world",
	    "searx.tiekoetter.com",
	    "searx.tuxcloud.net",
	    "searx.work",
	    "searx.zhenyapav.com",
	    "searxng.hweeren.com",
	    "searxng.online",
	    "searxng.shreven.org",
	    "searxng.site",
	    "skyrimhater.com",
	    "sx.ca.zorby.top",
	    "sx.catgirl.cloud",
	    "sx.thatxtreme.dev",
	    "xo.wtf",
	    "search.rhscz.eu",
	    "search.bus-hit.me",
	    "searx.rhscz.eu",
	    "search.indst.eu",
	    "searxng.ch",
	    "www.gruble.de",
	    "kantan.cat",
	    "search.canine.tools",
	    "search.mdosch.de",
	    "search.getcobalt.org",
	    "searx.electroncash.de",
	    "searx.perennialte.ch",
	    "searxng.brihx.fr",
	    "search.fredix.xyz",
	    "seek.fyi",
	    "search.nordh.tech",
	    "darmarit.org/searx",
	    "searx.ox2.fr",
	    "s.mble.dk",
	    "search.einfachzocken.eu",
	    "searx.mv-software.de",
	    "searx.nobulart.com",
	  ];

	  // Only a non-empty string can name an instance; `true` would otherwise be
	  // concatenated into the URL as the literal text "true".
	  const privateDomain =
	    typeof privateSearxng === "string" && privateSearxng !== ""
	      ? privateSearxng
	      : null;

	  // Select a random public domain if no private one is provided.
	  const searchDomain =
	    privateDomain ??
	    "https://" +
	      SEARX_DOMAINS[Math.floor(Math.random() * SEARX_DOMAINS.length)];

	  // Fall back to the first entry instead of sending "undefined" to the server.
	  const categoryName = CATEGORY_LIST[category] ?? CATEGORY_LIST[0];
	  const timeRangeName = RECENCY_LIST[recency] ?? RECENCY_LIST[0];

	  let url =
	    `${searchDomain}/search?q=${encodeURIComponent(query)}` +
	    `&category_${categoryName}=1&language=${encodeURIComponent(language)}` +
	    `&time_range=${timeRangeName}&safesearch=0&pageno=${page}`;

	  if (privateDomain) url += "&format=json";

	  // On Cloudflare, to avoid "Too many redirects" change SSL mode to Full.

	  if (typeof proxy === "string" && proxy !== "" && !privateDomain) {
	    url = proxy + url;
	  }

	  try {
	    const response = await fetch(url, {
	      headers: {
	        "accept-language": language + ",en;q=0.9",
	      },
	    });
	    const body = await response.text();

	    if (privateDomain) {
	      if (!body.trimStart().startsWith("{")) return { error: 1 };

	      const payload = JSON.parse(body) as { results?: unknown };
	      const rawResults: Array<{
	        title: string;
	        url: string;
	        content: string;
	        score: unknown;
	      }> = Array.isArray(payload.results) ? payload.results : [];

	      return rawResults.map(({ title, url: resultUrl, content, score }) => ({
	        title,
	        url:
	          typeof resultUrl === "string"
	            ? resultUrl.replace(/&amp;/g, "&")
	            : resultUrl,
	        snippet: content,
	        score,
	      }));
	    }

	    const results = parseSearxngResultsHtml(body);

	    // An empty page usually means the chosen instance blocked or rate-limited
	    // the request, so try again (a new random instance is picked).
	    if (results.length === 0 && maxRetries > 0) {
	      return await searchWeb(query, {
	        ...options,
	        maxRetries: maxRetries - 1,
	      });
	    }

	    return results;
	  } catch (error: unknown) {
	    // Anything can be thrown; narrow before reading `.message`.
	    const message = error instanceof Error ? error.message : String(error);

	    if (message === "ERR_TLS_CERT_ALTNAME_INVALID") {
	      console.error(`TLS certificate error: ${message}`);
	    } else if (message.includes("Unable to connect")) {
	      console.error(`Network error: ${message}`);
	    } else {
	      console.error(`Error fetching search results: ${message}`);
	    }
	    if (verbose) {
	      console.error(`Failed URL: ${url}`);
	    }
	    if (maxRetries > 0) {
	      return await searchWeb(query, { ...options, maxRetries: maxRetries - 1 });
	    }
	    return [];
	  }
	}


	// searchWeb("tata motors new cars", {
	// category: 0,
	// recency: 0,
	// maxRetries: 50,
	// }).then(console.log);
No results found