Skip to content

Instantly share code, notes, and snippets.

@Frenzycore
Created December 22, 2025 10:07
Show Gist options
  • Select an option

  • Save Frenzycore/f8b45d4648367e297cfdc95b34e700bc to your computer and use it in GitHub Desktop.

Select an option

Save Frenzycore/f8b45d4648367e297cfdc95b34e700bc to your computer and use it in GitHub Desktop.
Scraper for the site https://www.nasa.gov
import axios from "axios";
import * as cheerio from "cheerio"
/**
 * Scrape the NASA home page (https://www.nasa.gov/).
 *
 * Downloads the HTML with axios, parses it with cheerio and reads each
 * section by CSS selector. Text fields are trimmed; attribute fields are
 * whatever cheerio returns for the first match (possibly `undefined`) unless
 * a `|| null` fallback is applied below.
 *
 * @returns {Promise<object>} Object with the keys `mainStory`, `featuredNews`,
 *   `featuredVideo`, `imageOfTheDay`, `imageGalleries`, `artemisStory` and
 *   `topics`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasa() {
  try {
    const response = await axios.get('https://www.nasa.gov/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' },
    });
    const $ = cheerio.load(response.data);

    // Trimmed, concatenated text of everything matching `selector` below `scope`.
    const textIn = (scope, selector) => scope.find(selector).text().trim();

    // Main story: only the first magazine wrapper is considered.
    const hero = $('.hds-nasa-mag-wrapper').first();
    const relatedLinks = hero
      .find('.hds-nasa-mag-col')
      .map((_, columnEl) => {
        const column = $(columnEl);
        return {
          category: textIn(column, 'h3.label'),
          title: textIn(column, 'a span'),
          link: column.find('a').attr('href'),
        };
      })
      .get();
    const mainStory = {
      title: textIn(hero, 'h2.display-72') || null,
      description: textIn(hero, 'p.maxw-tablet') || null,
      link: hero.find('a.usa-button--secondary').attr('href') || null,
      image: hero.find('figure.hds-media-background img').attr('src') || null,
      relatedLinks,
    };

    // Featured news: anchors that produce no title text are dropped.
    const featuredNews = $('.wp-block-nasa-blocks-news-manual .latest-news-items a[href]')
      .map((_, anchorEl) => {
        const card = $(anchorEl);
        const picture = card.find('figure.hds-media-background img');
        return {
          title: textIn(card, 'p.heading-22 span, p.heading-14'),
          link: card.attr('href'),
          // Falls back to `data-src` when `src` is empty or absent.
          image: picture.attr('src') || picture.attr('data-src'),
          type: textIn(card, '.label svg + span'),
          readTime: card.find('.label').not(':has(svg)').text().trim(),
        };
      })
      .get()
      .filter((entry) => entry.title);

    const videoBlock = $('.wp-block-nasa-blocks-featured-video');
    const featuredVideo = {
      title: textIn(videoBlock, 'h3.heading-36'),
      description: textIn(videoBlock, 'p.color-carbon-20'),
      videoUrl: videoBlock.find('iframe').attr('src'),
      learnMoreLink: videoBlock.find('a.button-primary').attr('href'),
    };

    // Image of the day: the page URL is the first single-quoted string inside
    // the download wrapper's onClick attribute, when there is one.
    const iotdBlock = $('.wp-block-nasa-blocks-image-of-the-day');
    const onClickSource = iotdBlock.find('.hds-image-download-wrapper').attr('onClick') || '';
    const quotedMatch = onClickSource.match(/'([^']+)'/);
    let iotdPageUrl = null;
    if (quotedMatch) {
      iotdPageUrl = quotedMatch[1];
    }
    const imageOfTheDay = {
      title: textIn(iotdBlock, 'p.heading-22'),
      description: iotdBlock.find('p.p-md').first().text().trim(),
      imageUrl: iotdBlock.find('.hds-media-inner img').attr('src'),
      pageUrl: iotdPageUrl,
      downloadUrl: iotdBlock.find('a.hds-image-download-link').attr('href'),
    };

    const imageGalleries = $('.card-carousel-slider .hds-card-gallery')
      .map((_, galleryEl) => {
        const gallery = $(galleryEl);
        const thumbnails = gallery
          .find('.hds-card-gallery-images img')
          .map((__, imgEl) => $(imgEl).attr('src'))
          .get();
        return {
          title: textIn(gallery, 'h3.heading-16'),
          link: gallery.attr('href'),
          // Collapse runs of whitespace to single spaces.
          stats: textIn(gallery, '.display-flex.label').replace(/\s+/g, ' '),
          thumbnails,
        };
      })
      .get();

    const topics = $('a.hds-card-topic')
      .map((_, topicEl) => {
        const topic = $(topicEl);
        return {
          title: textIn(topic, '.hds-topic-card-heading span'),
          link: topic.attr('href'),
          image: topic.find('.hds-media-background img').attr('src'),
        };
      })
      .get();

    const storyBlock = $('.wp-block-nasa-blocks-story');
    const artemisStory = {
      subtitle: textIn(storyBlock, 'h3.subtitle-md'),
      title: textIn(storyBlock, 'h2.display-48'),
      description: textIn(storyBlock, 'p.p-md'),
      link: storyBlock.find('a.button-primary').attr('href'),
      image: storyBlock.find('figure.hds-media-inner img').attr('src'),
    };

    return {
      mainStory,
      featuredNews,
      featuredVideo,
      imageOfTheDay,
      imageGalleries,
      artemisStory,
      topics,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA+ landing page (https://plus.nasa.gov/).
 *
 * Collects the hero banner slides, the playlists with their videos, the
 * topic menu entries and the series carousel entries. Image URLs that are
 * read from a `background-image` CSS value have their `url(...)` wrapper
 * stripped; when no such value is present the field is an empty string.
 *
 * @returns {Promise<object>} Object with the keys `heroBanners`, `playlists`,
 *   `exploreTopics` and `nasaSeries`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasaPlus() {
  // Replace the first `url(...)` wrapper in a CSS value with its bare content.
  const unwrapCssUrl = (cssValue) => cssValue.replace(/url\(['"]?(.*?)['"]?\)/, "$1");
  // `background-image` as reported by cheerio for `node`, or "" when unset.
  const backgroundOf = (node) => node.css("background-image") || "";

  try {
    const response = await axios.get("https://plus.nasa.gov/", {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
      },
    });
    const $ = cheerio.load(response.data);

    const heroBanners = $(".banner_carousel .splide__slide")
      .map((_, slideEl) => {
        const slide = $(slideEl);
        const slideBackground = backgroundOf(slide);
        const logo = slide.find(".banner--logo img");
        return {
          // Prefer the logo's alt text; otherwise use the banner heading text.
          title: logo.attr("alt") || slide.find("h1.banner-title").text().trim(),
          description: slide.find(".banner--info-text p").text().trim(),
          watchLink: slide.find(".banner--info-actions a").first().attr("href") || null,
          logoImage: logo.attr("src") || null,
          backgroundImage: unwrapCssUrl(slideBackground),
        };
      })
      .get();

    // Maps one `article.video-grid` element to a plain video record.
    const toVideo = (_, videoEl) => {
      const video = $(videoEl);
      const thumbBackground = backgroundOf(video.find("figure.video-grid--thumbnail"));
      return {
        title: video.find("h4.video-grid--title").text().trim(),
        link: video.find("a.video-grid--link").attr("href"),
        thumbnail: unwrapCssUrl(thumbBackground),
        duration: video.find("p.font-family-mono").text().trim() || null,
        primaryTag: video.find(".tag-icon-group a").text().trim() || null,
      };
    };

    const playlists = $("div.playlists article.playlist")
      .map((_, playlistEl) => {
        const playlist = $(playlistEl);
        const title = playlist.find("h3.playlist--title").text().trim();
        const videos = playlist.find("article.video-grid").map(toVideo).get();
        return { title, videos };
      })
      .get();

    const exploreTopics = $(".topics-menu--single")
      .map((_, topicEl) => {
        const topic = $(topicEl);
        const topicBackground = backgroundOf(topic.find("figure.topic--thumbnail"));
        return {
          name: topic.find("h4").text().trim(),
          link: topic.find("a").attr("href"),
          image: unwrapCssUrl(topicBackground),
        };
      })
      .get();

    const nasaSeries = $("section.series_carousel article.series-grid")
      .map((_, seriesEl) => {
        const series = $(seriesEl);
        const seriesBackground = backgroundOf(series.find("figure.series-grid--thumbnail"));
        return {
          title: series.find("h4.series-grid--title").text().trim(),
          link: series.find("a.series-grid--link").attr("href"),
          image: unwrapCssUrl(seriesBackground),
          episodeInfo: series.find("p.font-family-mono").text().trim(),
        };
      })
      .get();

    return { heroBanners, playlists, exploreTopics, nasaSeries };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA news listing (https://www.nasa.gov/news/).
 *
 * Each `a.latest-news-item` inside the automated news block becomes one
 * article record. Links are resolved against the page URL (null when the
 * anchor has no href) and an empty category falls back to "General".
 *
 * @returns {Promise<{source: string, pageTitle: string, articles: object[]}>}
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasaNews() {
  const url = "https://www.nasa.gov/news/";
  try {
    const response = await axios.get(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
      },
    });
    const $ = cheerio.load(response.data);

    // Maps one news anchor element to a plain article record.
    const toArticle = (_, anchorEl) => {
      const item = $(anchorEl);
      const href = item.attr("href");
      const label = item.find("div.label svg + span").text().trim();

      let absoluteLink = null;
      if (href) {
        absoluteLink = new URL(href, url).href;
      }

      return {
        title: item.find("p.heading-22 span").text().trim(),
        link: absoluteLink,
        category: label || "General",
        readTime: item.find(".display-block .label").text().trim(),
        imageUrl: item.find("figure.hds-media-background img").attr("src"),
      };
    };

    const pageTitle = $("h1.display-72").text().trim();
    const articles = $("div.wp-block-nasa-blocks-news-automated a.latest-news-item")
      .map(toArticle)
      .get();

    return { source: "NASA News", pageTitle, articles };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA events listing (https://www.nasa.gov/events/).
 *
 * Every `li` under `ul.hds-event-items` becomes one event record with its
 * title, link, date text, event type and thumbnail source.
 *
 * @returns {Promise<{pageTitle: string, events: object[]}>}
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasaEvents() {
  try {
    const response = await axios.get('https://www.nasa.gov/events/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' },
    });
    const $ = cheerio.load(response.data);

    // Maps one list item element to a plain event record.
    const toEvent = (_, itemEl) => {
      const item = $(itemEl);
      return {
        title: item.find('h3.hds-event-title').text().trim(),
        url: item.find('a.hds-event-item').attr('href'),
        date: item.find('div.hds-event-date-time span.hds-event-date').text().trim(),
        eventType: item.find('span.hds-event-type').text().trim(),
        imageUrl: item.find('div.hds-event-thumbnail img').attr('src'),
      };
    };

    const events = $('ul.hds-event-items li').map(toEvent).get();
    const pageTitle = $('h1.heading-41').text().trim();

    return { pageTitle, events };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA launch schedule
 * (https://www.nasa.gov/event-type/launch-schedule/).
 *
 * Every direct `li` child of `ul.hds-event-items` is read through its
 * `a.hds-event-item` anchor. Missing links and images are reported as null.
 *
 * @returns {Promise<{source: string, url: string, launches: object[]}>}
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasaLaunches() {
  try {
    const response = await axios.get('https://www.nasa.gov/event-type/launch-schedule/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' },
    });
    const $ = cheerio.load(response.data);

    const launches = $('ul.hds-event-items > li')
      .map((_, rowEl) => {
        const anchor = $(rowEl).find('a.hds-event-item');
        const href = anchor.attr('href');
        const thumbnail = anchor.find('.hds-event-thumbnail img').attr('src');
        return {
          title: anchor.find('h3.hds-event-title').text().trim(),
          // Text of every span in the date/time container, concatenated.
          date: anchor.find('div.hds-event-date-time span').text().trim(),
          eventType: anchor.find('.hds-event-type').text().trim(),
          detailsUrl: href || null,
          imageUrl: thumbnail || null,
        };
      })
      .get();

    return {
      source: 'NASA Launch Schedule',
      url: 'https://www.nasa.gov/event-type/launch-schedule/',
      launches,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA multimedia page (https://www.nasa.gov/multimedia/).
 *
 * Reads the featured story sections, the custom cards in the card grid and
 * the topic cards. Links that start with "/" get the `https://www.nasa.gov`
 * origin prepended; any other value (including "") is returned unchanged.
 *
 * @returns {Promise<object>} Object with the keys `pageTitle`,
 *   `featuredSections`, `moreOnlineFeatures` and `discoverMore`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasaMultimedia() {
  const url = "https://www.nasa.gov/multimedia/";

  // Prefix site-relative paths with the www.nasa.gov origin.
  const withSiteOrigin = (link) => {
    if (link.startsWith("/")) {
      return `https://www.nasa.gov${link}`;
    }
    return link;
  };

  try {
    const response = await axios.get(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
      },
    });
    const $ = cheerio.load(response.data);
    const pageTitle = $("h1.page-heading-lg").text().trim();

    const featuredSections = $(".hds-story")
      .map((_, storyEl) => {
        const story = $(storyEl);
        const button = story.find("a.button-primary");
        return {
          type: story.find(".subtitle-md").text().trim(),
          title: story.find(".display-48").text().trim(),
          description: story.find("p.p-md").text().trim(),
          link: withSiteOrigin(button.attr("href") || ""),
          linkText: button.find("span").text().trim(),
          imageUrl: story.find("figure img").attr("src") || "",
        };
      })
      .get();

    const moreOnlineFeatures = $(".hds-card-grid .hds-card-custom")
      .map((_, cardEl) => {
        const card = $(cardEl);
        return {
          title: card.find("h3.heading-18").text().trim(),
          description: card.find("p").first().text().trim(),
          link: withSiteOrigin(card.find("a.button-primary").attr("href") || ""),
          imageUrl: card.find(".hds-card-thumbnail img").attr("src") || "",
        };
      })
      .get();

    const discoverMore = $(".hds-topic-cards a.topic-card")
      .map((_, topicEl) => {
        const topic = $(topicEl);
        return {
          title: topic.find(".hds-topic-card-heading span").text().trim(),
          link: withSiteOrigin(topic.attr("href") || ""),
          imageUrl: topic.find("figure img").attr("src") || "",
        };
      })
      .get();

    return { pageTitle, featuredSections, moreOnlineFeatures, discoverMore };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA missions page (https://www.nasa.gov/nasa-missions/).
 *
 * Reads the tabbed mission categories, the callout block (returned as
 * `artemisCallout`), the custom cards in the card grid and the featured link
 * list rows. Missing links, images and captions are reported as null.
 *
 * @returns {Promise<object>} Object with the keys `pageTitle`,
 *   `introduction`, `missionCategories`, `artemisCallout`,
 *   `featuredMissions` and `moreResources`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasaMissions() {
  // Trimmed text of everything matching `selector` below `scope`.
  const textIn = (scope, selector) => scope.find(selector).text().trim();
  // Attribute of the first match for `selector` below `scope`, or null.
  const attrIn = (scope, selector, name) => scope.find(selector).attr(name) || null;

  try {
    const response = await axios.get("https://www.nasa.gov/nasa-missions/", {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
      },
    });
    const $ = cheerio.load(response.data);

    const missionCategories = $(".hds-tabbed-section-tab")
      .map((_, tabEl) => {
        const tab = $(tabEl);
        return {
          category: textIn(tab, "h2.heading-29"),
          description: textIn(tab, "p.p-md"),
          link: attrIn(tab, "a.button-primary", "href"),
          imageUrl: attrIn(tab, ".hds-tabbed-section-image img", "src"),
          imageCaption: textIn(tab, ".hds-caption-text") || null,
        };
      })
      .get();

    // The callout is queried page-wide with compound selectors.
    const inCallout = (suffix) => $(`.wp-block-nasa-blocks-callout ${suffix}`);
    const artemisCallout = {
      title: inCallout("h2.page-heading-md").text().trim(),
      description: inCallout("p.p-lg").text().trim(),
      link: inCallout("a.button-primary").attr("href") || null,
      imageUrl: inCallout("figure.hds-media-background img").attr("src") || null,
    };

    const featuredMissions = $(".hds-card-grid .hds-card-custom")
      .map((_, cardEl) => {
        const card = $(cardEl);
        return {
          title: textIn(card, "h3.heading-18"),
          description: textIn(card, "p.margin-top-0"),
          link: attrIn(card, "a.button-primary", "href"),
          imageUrl: attrIn(card, "figure.hds-media-background img", "src"),
        };
      })
      .get();

    const moreResources = $(".hds-featured-link-list .grid-row.padding-y-2")
      .map((_, rowEl) => {
        const row = $(rowEl);
        return {
          title: textIn(row, "h2.heading-36"),
          link: attrIn(row, "a", "href"),
          imageUrl: attrIn(row, "figure.hds-media-background img", "src"),
        };
      })
      .get();

    return {
      pageTitle: $("h1.page-heading-md").first().text().trim(),
      introduction: $("p.p-lg").first().text().trim(),
      missionCategories,
      artemisCallout,
      featuredMissions,
      moreResources,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA "Humans in Space" page
 * (https://www.nasa.gov/humans-in-space/).
 *
 * Reads the page intro, the carousel cards, every callout block, every
 * featured video block, the automated news anchors (de-duplicated by their
 * raw href) and the topic cards.
 *
 * All link fields are resolved against the page URL. A link whose source
 * element has no href, or whose href cannot be parsed, is reported as null
 * instead of producing a bogus ".../undefined" URL or aborting the scrape.
 *
 * @returns {Promise<object>} Object with the keys `pageTitle`,
 *   `pageDescription`, `heroImage`, `latestExpeditions`, `callouts`,
 *   `featuredVideos`, `latestNews` and `discoverMoreTopics`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing
 *   throws; the original error is available as `cause`.
 */
async function scrapeNasaHumansInSpace() {
  const url = 'https://www.nasa.gov/humans-in-space/';

  // `new URL(undefined, base)` stringifies the input to "undefined" and
  // happily resolves it, so absent hrefs must be filtered out explicitly.
  const toAbsoluteUrl = (href) => {
    if (!href) {
      return null;
    }
    try {
      return new URL(href, url).href;
    } catch {
      return null;
    }
  };

  try {
    const { data: html } = await axios.get(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' },
    });
    const $ = cheerio.load(html);

    const introSection = $('.wp-block-nasa-blocks-page-intro');
    const pageTitle = introSection.find('h1.page-heading-md').text().trim();
    const pageDescription = introSection.find('p.p-lg').text().trim();
    const heroImage = introSection.find('figure.hds-media-background img').attr('src');

    const latestExpeditions = $('.hds-card-carousel .card-carousel-slider .hds-card')
      .map((_, el) => {
        const card = $(el);
        const linkElement = card.find('a.button-primary');
        return {
          title: linkElement.find('span').text().trim(),
          link: toAbsoluteUrl(linkElement.attr('href')),
          image: card.find('figure.hds-media-background img').attr('src'),
        };
      })
      .get();

    const callouts = $('.wp-block-nasa-blocks-callout')
      .map((_, el) => {
        const callout = $(el);
        return {
          title: callout.find('h2.page-heading-md').text().trim(),
          description: callout.find('p.p-lg').text().trim(),
          link: toAbsoluteUrl(callout.find('a.button-primary').attr('href')),
          backgroundImage: callout.find('figure.hds-media-background img').attr('src'),
        };
      })
      .get();

    const featuredVideos = $('.wp-block-nasa-blocks-featured-video')
      .map((_, el) => {
        const video = $(el);
        return {
          title: video.find('h3.heading-36').text().trim(),
          description: video.find('p.color-carbon-20').text().trim(),
          videoUrl: video.find('iframe').attr('src'),
          moreInfoLink: toAbsoluteUrl(video.find('a.button-primary').attr('href')),
        };
      })
      .get();

    // The same article can be linked from several anchors; keep the first
    // anchor seen for each raw href.
    const newsItems = [];
    const processedLinks = new Set();
    $('.wp-block-nasa-blocks-news-automated .grid-row a').each((_, el) => {
      const anchor = $(el);
      const link = anchor.attr('href');
      if (!link || processedLinks.has(link)) {
        return;
      }
      processedLinks.add(link);
      newsItems.push({
        title: anchor.find('p.heading-22, p.heading-14').text().trim(),
        link: toAbsoluteUrl(link),
        image: anchor.find('img').attr('src'),
        type: anchor.find('.label svg + span').text().trim(),
        // Only the label's direct text nodes, i.e. without nested element text.
        readTime: anchor
          .find('.label')
          .contents()
          .filter((__, node) => node.type === 'text')
          .text()
          .trim(),
      });
    });

    const discoverMoreTopics = $('.wp-block-nasa-blocks-topic-cards .topic-card')
      .map((_, el) => {
        const topic = $(el);
        return {
          title: topic.find('p.hds-topic-card-heading span').text().trim(),
          link: toAbsoluteUrl(topic.attr('href')),
          image: topic.find('figure.hds-media-background img').attr('src'),
        };
      })
      .get();

    return {
      pageTitle,
      pageDescription,
      heroImage,
      latestExpeditions,
      callouts,
      featuredVideos,
      latestNews: newsItems,
      discoverMoreTopics,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * Scrape the NASA Science home page (https://science.nasa.gov/).
 *
 * Reads the mission header (hero) with its featured links, the custom cards
 * in the card grid and in the carousel, the latest news anchors, the
 * featured video block and the featured story block.
 *
 * @returns {Promise<object>} Object with the keys `heroSection`,
 *   `featuredMissions`, `latestNews`, `featuredVideo`, `featuredStory` and
 *   `diveDeeper`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasaScience() {
  const url = 'https://science.nasa.gov/';
  try {
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
      },
    });
    const $ = cheerio.load(response.data);

    // Trimmed text of a page-wide selector.
    const pageText = (selector) => $(selector).text().trim();

    // Both card lists (grid and carousel) are read with the same field selectors.
    const toCustomCard = (_, cardEl) => {
      const card = $(cardEl);
      return {
        title: card.find('h3.heading-18').text().trim(),
        description: card.find('p.margin-top-0').text().trim(),
        link: card.find('a.button-primary').attr('href'),
        image: card.find('.hds-card-thumbnail img').attr('src'),
      };
    };

    const featuredLinks = $('.hds-mission-header .grid-col-12.desktop\\:grid-col-4')
      .map((_, columnEl) => {
        const column = $(columnEl);
        return {
          category: column.find('p.label').text().trim(),
          title: column.find('a.button-primary span').text().trim(),
          link: column.find('a.button-primary').attr('href'),
        };
      })
      .get();
    const heroSection = {
      title: pageText('h1.page-heading-md'),
      description: pageText('div.hds-mission-header p.p-lg'),
      backgroundImage: $('.hds-mission-header > figure.hds-media-background img').attr('src'),
      featuredLinks,
    };

    const featuredMissions = $('.hds-card-grid-cards .hds-card-custom').map(toCustomCard).get();

    const latestNews = $('.latest-news-items a.latest-news-item')
      .map((_, anchorEl) => {
        const item = $(anchorEl);
        return {
          title: item.find('p[class*="heading-"]').text().trim(),
          link: item.attr('href'),
          image: item.find('figure img').attr('src'),
          readTime: item.find('.label').first().text().trim(),
          // Items without a type label are reported as 'Article'.
          type: item.find('svg + span').text().trim() || 'Article',
        };
      })
      .get();

    const featuredVideo = {
      // Whitespace runs inside the heading are collapsed to single spaces.
      title: pageText('.hds-featured-video h3.heading-36').replace(/\s+/g, ' '),
      description: pageText('.hds-featured-video p.color-carbon-20'),
      videoUrl: $('.hds-featured-video iframe').attr('src'),
      link: $('.hds-featured-video a.button-primary').attr('href'),
    };

    const featuredStory = {
      title: pageText('.hds-featured-story h1.display-60'),
      description: pageText('.hds-featured-story p.p-lg'),
      link: $('.hds-featured-story a.button-primary').attr('href'),
      image: $('.hds-featured-story > figure.hds-media-background img').attr('src'),
    };

    const diveDeeper = $('.card-carousel-slider .hds-card-custom').map(toCustomCard).get();

    return {
      heroSection,
      featuredMissions,
      latestNews,
      featuredVideo,
      featuredStory,
      diveDeeper,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA Science "Earth" page (https://science.nasa.gov/earth/).
 *
 * Reads the page intro, the article cards of the card grid headed
 * "Recent News and Articles", the items of `#post-list-container`, every
 * featured video block and the topic cards.
 *
 * Link fields of `recentNews`, `earthObservatoryImages` and `keepExploring`
 * are resolved against the page URL. A link whose source element has no
 * href, or whose href cannot be parsed, is reported as null instead of
 * producing a bogus ".../undefined" URL or aborting the scrape.
 *
 * @returns {Promise<object>} Object with the keys `pageTitle`,
 *   `pageSubtitle`, `heroImage`, `recentNews`, `earthObservatoryImages`,
 *   `featuredVideos` and `keepExploring`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing
 *   throws; the original error is available as `cause`.
 */
async function scrapeNasaScienceEarth() {
  const url = "https://science.nasa.gov/earth/";

  // `new URL(undefined, base)` stringifies the input to "undefined" and
  // resolves it, so absent hrefs must be filtered out explicitly.
  const toAbsoluteUrl = (href) => {
    if (!href) {
      return null;
    }
    try {
      return new URL(href, url).href;
    } catch {
      return null;
    }
  };

  try {
    const { data: html } = await axios.get(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
      },
    });
    const $ = cheerio.load(html);

    const mainTitle = $("h1.page-heading-md").text().trim();
    const mainSubtitle = $("p.p-lg.color-carbon-30-important").text().trim();
    const mainImage = $(".hds-page-intro-banner figure.hds-media-background img").attr("src");

    const recentNews = $(
      'div.hds-card-grid:has(h2:contains("Recent News and Articles")) a.hds-card-article'
    )
      .map((_, el) => {
        const element = $(el);
        return {
          title: element.find("h3.heading-18").text().trim(),
          link: toAbsoluteUrl(element.attr("href")),
          image: element.find("figure.hds-media-background img").attr("src"),
          readTime: element.find(".label").first().text().trim(),
          // The label text also carries the word "Article"; strip it out.
          date: element
            .find(".label.related-article-label")
            .text()
            .replace(/Article/g, "")
            .trim(),
        };
      })
      .get();

    const earthObservatoryImages = $("#post-list-container .hds-content-item")
      .map((_, el) => {
        const element = $(el);
        return {
          title: element.find(".hds-a11y-heading-22").text().trim(),
          link: toAbsoluteUrl(element.find("a.hds-content-item-heading").attr("href")),
          image: element.find("figure.hds-media-background img").attr("src"),
          description: element.find("p").text().trim(),
          readTime: element.find(".hds-content-item-readtime").text().trim(),
          date: element.find(".label.margin-y-1").text().trim(),
        };
      })
      .get();

    const featuredVideos = $(".hds-featured-video")
      .map((_, el) => {
        const element = $(el);
        return {
          title: element.find("h3.heading-36").text().trim(),
          description: element.find("p.p-sm").text().trim(),
          watchLink: element.find("a.button-primary").attr("href"),
          embedUrl: element.find("iframe").attr("src"),
        };
      })
      .get();

    const keepExploring = $(".hds-topic-cards-wrapper a.topic-card")
      .map((_, el) => {
        const element = $(el);
        return {
          title: element.find(".hds-topic-card-heading span").text().trim(),
          link: toAbsoluteUrl(element.attr("href")),
          image: element.find("figure.hds-media-background img").attr("src"),
          description: element.find("p.margin-bottom-0").text().trim() || null,
        };
      })
      .get();

    return {
      pageTitle: mainTitle,
      pageSubtitle: mainSubtitle,
      heroImage: mainImage,
      recentNews,
      earthObservatoryImages,
      featuredVideos,
      keepExploring,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * Scrape the NASA Science "Solar System" page
 * (https://science.nasa.gov/solar-system/).
 *
 * Reads the topic hero with its stats, the "meet the" cards, the tabbed
 * section panels, the featured video block, the eyes.nasa.gov iframe embed,
 * three card grids located by their heading text, the news anchors and the
 * topic cards.
 *
 * A tab panel without an `id` attribute gets an empty `tabName` instead of
 * throwing and failing the whole scrape.
 *
 * @returns {Promise<object>} Object with the keys `pageTitle`, `hero`,
 *   `tenThingsAboutSolarSystem`, `mainContentTabs`, `featuredVideo`,
 *   `eyesOnSolarSystem`, `featuredMissions`, `featuredArticles`,
 *   `resources`, `solarSystemNews` and `discoverMoreTopics`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing
 *   throws. The original error is attached as `cause` rather than logged
 *   here, so the caller decides how to report it.
 */
async function scrapeNasaScienceSolarSystem() {
  const url = 'https://science.nasa.gov/solar-system/';
  try {
    const { data: html } = await axios.get(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36' },
    });
    const $ = cheerio.load(html);

    // Direct text nodes of the first `.label` under `node`, trimmed.
    const readTimeOf = (node) =>
      node
        .find('.label')
        .first()
        .contents()
        .filter((_, child) => child.type === 'text')
        .text()
        .trim();

    const heroSection = {
      title: $('.hds-topic-hero h1').text().trim(),
      subtitle: $('.hds-topic-hero p.p-lg').text().trim(),
      imageUrl: $('.hds-topic-hero figure img').attr('src') || null,
      stats: $('.hds-topic-hero-stats .grid-row .grid-col.grid-row')
        .map((_, el) => ({
          value: $(el).find('.stat-number').text().trim(),
          label: $(el).find('.stat-value').text().trim(),
        }))
        .get(),
    };

    const tenThings = $('.hds-meet-the-card')
      .map((_, el) => ({
        title: $(el).find('h3.heading-18').text().trim(),
        description: $(el).find('p').text().trim(),
        link: $(el).find('a').attr('href') || null,
        imageUrl: $(el).find('figure img').attr('src') || null,
      }))
      .get();

    const mainTabs = $('.hds-tabbed-section-tab')
      .map((i, el) => {
        const panel = $(el);
        // The tab label element is looked up as `#tab<i>-<panel id minus its
        // first six characters>`; without a panel id there is nothing to look up.
        const panelId = panel.attr('id');
        const tabName = panelId
          ? $(`#tab${i}-${panelId.substring(6)}`).text().trim()
          : '';
        return {
          tabName,
          title: panel.find('h2.heading-29').text().trim(),
          description: [
            panel.find('p.heading-16').text().trim(),
            panel.find('p.p-md').text().trim(),
          ]
            .filter(Boolean)
            .join(' '),
          link: panel.find('a.button-primary').attr('href') || null,
          image: {
            url: panel.find('figure img').attr('src') || null,
            caption: panel.find('figcaption .hds-caption-text').text().trim(),
            credit: panel.find('figcaption .hds-credits').text().trim(),
          },
        };
      })
      .get();

    const featuredVideo = {
      label: $('.hds-featured-video .label').text().trim(),
      title: $('.hds-featured-video h3').text().trim(),
      description: $('.hds-featured-video p').text().trim(),
      learnMoreLink: $('.hds-featured-video a.button-primary').attr('href') || null,
      videoUrl: $('.hds-featured-video iframe').attr('src') || null,
    };

    // Cards of the card grid whose section heading contains `headingText`.
    const getCards = (headingText) =>
      $(`h2.section-heading-sm:contains("${headingText}")`)
        .closest('.hds-card-grid')
        .find('.hds-card-custom')
        .map((_, el) => ({
          title: $(el).find('h3.heading-18').text().trim(),
          description: $(el).find('p').text().trim() || null,
          link: $(el).find('a.button-primary').attr('href') || null,
          imageUrl: $(el).find('figure img').attr('src') || null,
        }))
        .get();

    // News comes from two markups: the large items first, then the "bubble" items.
    const newsItems = $('.latest-news-item a')
      .map((_, el) => {
        const item = $(el);
        return {
          title: item.find('p.heading-22 span').text().trim(),
          link: item.attr('href'),
          type: item.find('.label span').text().trim(),
          readTime: readTimeOf(item),
          imageUrl: item.find('figure img').attr('src'),
        };
      })
      .get();
    $('.hds-news-item-bubble')
      .closest('a')
      .each((_, el) => {
        const item = $(el);
        newsItems.push({
          title: item.find('p.heading-14').text().trim(),
          link: item.attr('href'),
          type: item.find('svg + span').text().trim(),
          readTime: readTimeOf(item),
          imageUrl: item.find('figure img').attr('src'),
        });
      });

    const discoverTopics = $('.hds-topic-cards-wrapper a.topic-card')
      .map((_, el) => ({
        title: $(el).find('p.hds-topic-card-heading span').text().trim(),
        link: $(el).attr('href'),
        imageUrl: $(el).find('figure img').attr('src') || null,
      }))
      .get();

    const eyesIframe = $('iframe[src*="eyes.nasa.gov"]');
    const eyesEmbed = eyesIframe.closest('.SmdBlockIframeEmbedBlock');

    return {
      pageTitle: $('title').text().trim(),
      hero: heroSection,
      tenThingsAboutSolarSystem: tenThings,
      mainContentTabs: mainTabs,
      featuredVideo,
      eyesOnSolarSystem: {
        title: $('h2:contains("Eyes on the Solar System")').text().trim(),
        iframeUrl: eyesIframe.attr('src') || null,
        caption: eyesEmbed.find('.hds-caption-text').text().trim(),
        credit: eyesEmbed.find('.hds-credits').text().trim(),
      },
      featuredMissions: getCards('Featured Missions'),
      featuredArticles: getCards('Featured Articles'),
      resources: getCards('Resources'),
      solarSystemNews: newsItems,
      discoverMoreTopics: discoverTopics,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * Scrape the NASA Science "Universe" page
 * (https://science.nasa.gov/universe/).
 *
 * Reads the page intro, the "meet the" cards, the featured link block, the
 * featured video block, every callout block and the automated news items.
 * Missing attribute values are reported as empty strings.
 *
 * @returns {Promise<object>} Object with the keys `pageTitle`, `hero`,
 *   `exploreTopics`, `featuredStory`, `featuredVideo`, `callouts` and `news`
 *   (`{ sectionTitle, articles }`).
 * @throws {Error} "Scraping failed for <url>: <reason>" when fetching or
 *   parsing throws.
 */
async function scrapeNasaScienceUniverse() {
  const url = 'https://science.nasa.gov/universe/';
  try {
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
      },
    });
    const $ = cheerio.load(response.data);

    // Page-wide lookups scoped to one block via a compound selector.
    const inIntro = (suffix) => $(`.hds-page-intro ${suffix}`);
    const inFeaturedLink = (suffix) => $(`.wp-block-nasa-blocks-featured-link ${suffix}`);
    const inFeaturedVideo = (suffix) => $(`.hds-featured-video ${suffix}`);

    const heroSection = {
      title: inIntro('h1.page-heading-md').text().trim(),
      description: inIntro('p.p-lg').text().trim(),
      backgroundVideo: inIntro('video source').attr('src') || '',
    };

    const exploreTopics = $('.hds-meet-the-card')
      .map((_, cardEl) => {
        const card = $(cardEl);
        const firstAnchor = card.find('a').first();
        return {
          title: card.find('h3.heading-18').text().trim(),
          url: firstAnchor.attr('href') || '',
          description: card.find('p.p-sm').text().trim(),
          imageUrl: card.find('img').attr('src') || '',
        };
      })
      .get();

    const storyImage = inFeaturedLink('figure img');
    const featuredStory = {
      title: inFeaturedLink('h2.page-heading-md').text().trim(),
      description: inFeaturedLink('p.p-md').text().trim(),
      link: inFeaturedLink('a.button-primary').attr('href') || '',
      imageUrl: storyImage.attr('src') || '',
      imageAlt: storyImage.attr('alt') || '',
      imageCaption: inFeaturedLink('figcaption .hds-caption-text').text().trim(),
    };

    const featuredVideo = {
      title: inFeaturedVideo('h3.heading-36').text().trim(),
      description: inFeaturedVideo('p.p-sm').text().trim(),
      videoUrl: inFeaturedVideo('iframe').attr('src') || '',
      learnMoreLink: inFeaturedVideo('a.button-primary').attr('href') || '',
    };

    const callouts = $('.wp-block-nasa-blocks-callout')
      .map((_, calloutEl) => {
        const callout = $(calloutEl);
        return {
          title: callout.find('h2.page-heading-md').text().trim(),
          description: callout.find('p.p-lg').text().trim(),
          link: callout.find('a.button-primary').attr('href') || '',
          imageUrl: callout.find('figure img').attr('src') || '',
        };
      })
      .get();

    const newsArticles = $('.wp-block-nasa-blocks-news-automated .latest-news-item a')
      .map((_, anchorEl) => {
        const item = $(anchorEl);
        const rawReadTime = item.find('.label').first().text().trim();
        return {
          title: item.find('p.heading-22 span').text().trim(),
          url: item.attr('href') || '',
          // Lower-case the " Min Read" suffix.
          readTime: rawReadTime.replace(/ Min Read/g, ' min read'),
          imageUrl: item.find('figure img').attr('src') || '',
        };
      })
      .get();

    const news = {
      sectionTitle: $('.wp-block-nasa-blocks-news-automated h2.section-heading-md').text().trim(),
      articles: newsArticles,
    };

    return {
      pageTitle: $('title').text().trim(),
      hero: heroSection,
      exploreTopics,
      featuredStory,
      featuredVideo,
      callouts,
      news,
    };
  } catch (error) {
    throw new Error(`Scraping failed for ${url}: ${error.message}`);
  }
}
/**
 * Scrape the NASA aeronautics page (https://www.nasa.gov/aeronautics/).
 *
 * Reads the topic hero, the cards of the first and last `.hds-meet-the`
 * blocks, the automated news (large items followed by "bubble" items, merged
 * into `latestNews`), the embedded video, the first `.hds-ask-nasa` block,
 * the card grid, the featured link block and the topic cards.
 *
 * @returns {Promise<object>} Object with the keys `pageTitle`, `hero`,
 *   `featuredContent`, `latestNews`, `featuredVideo`, `researchDirectorate`,
 *   `aviationVision`, `quesstMission`, `otherTopics` and
 *   `discoverMoreTopics`.
 * @throws {Error} "Scraping failed: <reason>" when fetching or parsing throws.
 */
async function scrapeNasaAeronautics() {
  try {
    const response = await axios.get('https://www.nasa.gov/aeronautics/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' },
    });
    const $ = cheerio.load(response.data);

    // Trimmed text of everything matching `selector` below `scope`.
    const textIn = (scope, selector) => scope.find(selector).text().trim();

    // Shared record shape for the cards of both `.hds-meet-the` blocks.
    const toMeetTheCard = (_, cardEl) => {
      const card = $(cardEl);
      return {
        title: textIn(card, 'h3'),
        description: textIn(card, 'p'),
        link: card.find('a').first().attr('href'),
        image: card.find('img').attr('src'),
      };
    };

    const heroBlock = $('.hds-topic-hero');
    const heroPicture = heroBlock.find('figure img');
    const hero = {
      title: textIn(heroBlock, 'h1'),
      description: textIn(heroBlock, 'p.p-lg'),
      image: heroPicture.attr('src'),
      imageAlt: heroPicture.attr('alt'),
      caption: textIn(heroBlock, 'h2.label'),
    };

    const featuredContent = $('.hds-meet-the')
      .first()
      .find('.hds-meet-the-card')
      .map(toMeetTheCard)
      .get();

    const primaryNews = $('.wp-block-nasa-blocks-news-automated .latest-news-item a')
      .map((_, anchorEl) => {
        const item = $(anchorEl);
        return {
          title: textIn(item, 'p.heading-22 span'),
          link: item.attr('href'),
          image: item.find('figure img').attr('src'),
          readTime: item.find('div.label').first().text().trim(),
          type: item.find('svg').next('span').text().trim(),
        };
      })
      .get();

    const bubbleNews = $('.wp-block-nasa-blocks-news-automated .hds-news-item-bubble')
      .map((_, bubbleEl) => {
        const bubble = $(bubbleEl);
        return {
          title: textIn(bubble, 'p.heading-14'),
          // The bubble sits inside its link, so the href is on the closest anchor.
          link: bubble.closest('a').attr('href'),
          image: bubble.find('figure img').attr('src'),
          readTime: bubble.find('div.label').first().text().trim(),
          type: bubble.find('svg').next('span').text().trim(),
        };
      })
      .get();

    const aviationVisionCards = $('.wp-block-nasa-blocks-card-grid .hds-card-custom')
      .map((_, cardEl) => {
        const card = $(cardEl);
        return {
          title: textIn(card, 'h3.heading-18'),
          description: textIn(card, 'p'),
          link: card.find('a.button-primary').attr('href'),
          image: card.find('figure img').attr('src'),
        };
      })
      .get();

    const otherTopics = $('.hds-meet-the')
      .last()
      .find('.hds-meet-the-card')
      .map(toMeetTheCard)
      .get();

    const discoverMoreTopics = $('.wp-block-nasa-blocks-topic-cards .topic-card')
      .map((_, topicEl) => {
        const topic = $(topicEl);
        return {
          title: textIn(topic, '.hds-topic-card-heading span'),
          link: topic.attr('href'),
          image: topic.find('figure img').attr('src'),
        };
      })
      .get();

    const featuredVideo = {
      title: $('h2.wp-block-heading:contains("New Video")').text().trim(),
      embedUrl: $('figure.wp-block-embed-youtube iframe').attr('src'),
    };

    // Only the first `.hds-ask-nasa` block is used.
    const askNasa = $('.hds-ask-nasa').eq(0);
    const researchDirectorate = {
      title: textIn(askNasa, 'h2.display-48'),
      description: textIn(askNasa, 'p.p-md'),
      link: askNasa.find('a.button-primary').attr('href'),
      image: askNasa.find('figure img').attr('src'),
      imageCredit: textIn(askNasa, '.hds-credits'),
    };

    const aviationVision = {
      title: $('.wp-block-nasa-blocks-card-grid h2.section-heading-sm').text().trim(),
      description: $('.wp-block-nasa-blocks-card-grid > .hds-card-grid-header p').text().trim(),
      cards: aviationVisionCards,
    };

    // Page-wide lookups scoped to the featured link block via a compound selector.
    const inFeaturedLink = (suffix) => $(`.wp-block-nasa-blocks-featured-link ${suffix}`);
    const quesstMission = {
      title: inFeaturedLink('h2.page-heading-md').text().trim(),
      description: inFeaturedLink('p.p-md').text().trim(),
      link: inFeaturedLink('a.button-primary').attr('href'),
      image: inFeaturedLink('figure img').attr('src'),
      imageCredit: inFeaturedLink('.hds-credits').text().trim(),
    };

    return {
      pageTitle: $('title').text().trim(),
      hero,
      featuredContent,
      latestNews: [...primaryNews, ...bubbleNews],
      featuredVideo,
      researchDirectorate,
      aviationVision,
      quesstMission,
      otherTopics,
      discoverMoreTopics,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA "Technology" topic page.
 *
 * Downloads https://www.nasa.gov/technology/ with a browser-like User-Agent,
 * parses it with cheerio and extracts the hero banner, the featured podcast,
 * the latest-news links, the technology subtopic rows and the "meet the" cards.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   heroSection: object,
 *   featuredPodcast: object,
 *   latestNews: object[],
 *   technologySubtopics: object[],
 *   facesOfTechnology: object[]
 * }>} The extracted page data. Links that are absent in the markup are ''.
 * @throws {Error} "Scraping failed: <reason>" when the request or parsing fails;
 *   the original error is attached as `cause`.
 */
async function scrapeNasaTechnology() {
  const url = 'https://www.nasa.gov/technology/';

  // Resolve an href against the page URL. A missing href yields '' rather than
  // being stringified by the URL constructor into ".../undefined", and an href
  // the URL parser rejects is returned untouched so a single bad link does not
  // abort the whole scrape.
  const toAbsoluteUrl = (href) => {
    if (!href) {
      return '';
    }
    try {
      return new URL(href, url).href;
    } catch {
      return href;
    }
  };

  try {
    const { data: html } = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
      }
    });
    const $ = cheerio.load(html);

    const heroSection = {
      title: $('div.hds-topic-hero h1').text().trim(),
      description: $('div.hds-topic-hero p.p-lg').text().trim(),
      backgroundImage: $('div.hds-topic-hero figure img').attr('src') || ''
    };

    const featuredPodcast = {
      seriesTitle: $('div.hds-featured-podcasts h2.heading-41').text().trim(),
      seriesDescription: $('div.hds-featured-podcasts p.p-sm').first().text().trim(),
      seriesLink: toAbsoluteUrl($('div.hds-featured-podcasts a.button-primary').attr('href')),
      latestEpisode: {
        title: $('div.hds-featured-podcasts h3.heading-18').text().trim(),
        date: $('div.hds-featured-podcasts .heading-12').text().trim(),
        audioUrl: $('div.hds-audio-player-wrap audio source').attr('src') || '',
        detailsLink: toAbsoluteUrl($('div.hds-featured-podcasts a.hds-link').attr('href'))
      }
    };

    const latestNews = $('.latest-news-items a.latest-news-item').map((_, el) => {
      const $el = $(el);
      const title = $el.find('p[class*="heading-"]').text().trim();
      const link = toAbsoluteUrl($el.attr('href'));
      const image = $el.find('figure img').attr('src') || '';
      // The read-time label is the one without an icon.
      const readTime = $el.find('.label').not(':has(svg)').text().trim();
      return { title, link, image, readTime };
    }).get();

    const technologySubtopics = $('.wp-block-nasa-blocks-featured-link-list .featured-link-list-row').map((_, el) => {
      const $el = $(el);
      const title = $el.find('h2.heading-36').text().trim();
      const description = $el.find('p.p-md').text().trim();
      const link = toAbsoluteUrl($el.find('a').attr('href'));
      const image = $el.find('figure img').attr('src') || '';
      return { title, description, link, image };
    }).get();

    const facesOfTechnology = $('.hds-meet-the-card').map((_, el) => {
      const $el = $(el);
      const name = $el.find('h3.heading-18').text().trim();
      const role = $el.find('p.p-sm').text().trim();
      const videoLink = $el.find('a').attr('href') || '';
      const image = $el.find('figure img').attr('src') || '';
      return { name, role, videoLink, image };
    }).get();

    return {
      pageTitle: $('title').text().trim(),
      heroSection,
      featuredPodcast,
      latestNews,
      technologySubtopics,
      facesOfTechnology
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * Collect the main content blocks of the NASA "Learning Resources" page.
 *
 * The page at https://www.nasa.gov/learning-resources/ is fetched and parsed
 * with cheerio. Every link and image reference is resolved against the page
 * URL; a reference that is missing in the markup is reported as null.
 *
 * @returns {Promise<{
 *   hero: object,
 *   findYourPlaceInSpace: object[],
 *   learningSections: object[],
 *   latestNews: object[],
 *   diveIntoStem: object[],
 *   socialLinks: object[]
 * }>} The extracted sections.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaLearningResources() {
  const url = 'https://www.nasa.gov/learning-resources/';
  const requestOptions = {
    headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' }
  };

  try {
    const response = await axios.get(url, requestOptions);
    const $ = cheerio.load(response.data);

    // Absolute form of a reference found on the page, or null when absent.
    const resolveUrl = (relativeUrl) => {
      if (!relativeUrl) {
        return null;
      }
      return new URL(relativeUrl, url).href;
    };

    const heroSection = {
      title: $('div.hds-topic-hero h1.display-72').text().trim(),
      description: $('div.hds-topic-hero p.p-lg').text().trim(),
      imageUrl: resolveUrl($('div.hds-topic-hero figure.hds-media-background img').attr('src')),
    };

    // Cards of the first "meet the" block on the page.
    const placeCards = $('div.hds-meet-the').first().find('.hds-meet-the-card').toArray();
    const findYourPlaceInSpace = placeCards.map((node) => {
      const card = $(node);
      return {
        title: card.find('h3.heading-18').text().trim(),
        link: resolveUrl(card.find('a').first().attr('href')),
        imageUrl: resolveUrl(card.find('a.hds-meet-the-image img').attr('src')),
        description: card.find('p.p-sm').text().trim(),
      };
    });

    const learningSections = $('.hds-tabbed-section-tab').toArray().map((node) => {
      const tab = $(node);
      return {
        title: tab.find('h2.heading-29').text().trim(),
        subtitle: tab.find('p.heading-16').text().trim(),
        description: tab.find('p.p-md').text().trim(),
        link: resolveUrl(tab.find('a.button-primary').attr('href')),
        imageUrl: resolveUrl(tab.find('.hds-tabbed-section-image img').attr('src')),
      };
    });

    const latestNews = $('div.wp-block-nasa-blocks-news-automated .latest-news-item a').toArray().map((node) => {
      const anchor = $(node);
      // Prefer the image inside a <figure>; fall back to any image in the link.
      const imageSrc = anchor.find('figure img').attr('src') || anchor.find('img').attr('src');
      return {
        title: anchor.find('p.heading-22, p.heading-14').text().trim(),
        link: resolveUrl(anchor.attr('href')),
        imageUrl: resolveUrl(imageSrc),
        readTime: anchor.find('div.label').first().text().trim(),
      };
    });

    const diveIntoStem = $('div.wp-block-nasa-blocks-featured-link-list .featured-link-list-row').toArray().map((node) => {
      const row = $(node);
      return {
        title: row.find('h2.heading-36').text().trim(),
        description: row.find('p.p-md').text().trim(),
        link: resolveUrl(row.find('a').last().attr('href')),
        imageUrl: resolveUrl(row.find('figure.hds-media-background img').attr('src')),
      };
    });

    const socialLinks = $('div#social.hds-meet-the .hds-meet-the-card').toArray().map((node) => {
      const card = $(node);
      return {
        platform: card.find('h3.heading-18').text().trim(),
        link: resolveUrl(card.find('a').first().attr('href')),
        imageUrl: resolveUrl(card.find('a.hds-meet-the-image img').attr('src')),
      };
    });

    return {
      hero: heroSection,
      findYourPlaceInSpace,
      learningSections,
      latestNews,
      diveIntoStem,
      socialLinks,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA "About" page.
 *
 * Downloads https://www.nasa.gov/about/ and extracts the page heading and
 * intro, the tabbed main sections, the "NASA Leadership" card grid, the
 * "Careers at NASA" callout and the footer link groups.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   introDescription: string,
 *   mainSections: object[],
 *   leadership: object[],
 *   careersCallout: object,
 *   footer: { mainLinks: object[], socialLinks: object[], utilityLinks: object[] }
 * }>} The extracted page data. A social link whose aria-label does not have
 *   the "<something> on <platform> (...)" shape gets `platform: null`.
 * @throws {Error} "Scraping failed: <reason>" when the request or parsing fails;
 *   the original error is attached as `cause`.
 */
async function scrapeNasaAbout() {
  try {
    const { data: html } = await axios.get('https://www.nasa.gov/about/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36' }
    });
    const $ = cheerio.load(html);

    const pageTitle = $('h1.page-heading-md').text().trim();
    const introDescription = $('div.hds-page-intro p.p-lg').text().trim();

    const mainSections = $('.hds-tabbed-section-tab').map((i, el) => {
      const element = $(el);
      const linkElement = element.find('a.button-primary');
      return {
        title: element.find('h2.heading-29').text().trim(),
        description: element.find('p.p-md').text().trim(),
        link: {
          // Collapse runs of whitespace inside the button label.
          text: linkElement.text().trim().replace(/\s\s+/g, ' '),
          url: linkElement.attr('href') || null
        },
        imageUrl: element.find('.hds-tabbed-section-image img').attr('src') || null,
        imageCaption: element.find('figcaption .hds-caption-text').text().trim() || null
      };
    }).get();

    const leadership = [];
    $('.hds-card-grid-header:contains("NASA Leadership")').closest('.hds-card-grid').find('.hds-card-custom').each((i, cardEl) => {
      const card = $(cardEl);
      leadership.push({
        nameAndTitle: card.find('h3.heading-18').text().trim(),
        description: card.find('p').text().trim(),
        imageUrl: card.find('.hds-card-thumbnail img').attr('src'),
        bioUrl: card.find('a.button-primary').attr('href')
      });
    });

    const careersCalloutElement = $('.wp-block-nasa-blocks-callout:contains("Careers at NASA")');
    const careersCallout = {
      title: careersCalloutElement.find('h2.page-heading-md').text().trim(),
      description: careersCalloutElement.find('p.p-lg').text().trim(),
      link: {
        // Text of the button's own span, ignoring nested child elements.
        text: careersCalloutElement.find('a.button-primary span').clone().children().remove().end().text().trim(),
        url: careersCalloutElement.find('a.button-primary').attr('href')
      },
      backgroundImageUrl: careersCalloutElement.find('figure.hds-media-background img').attr('src')
    };

    // The class name is literally "desktop:grid-col-6". The colon must reach
    // the selector engine escaped, which needs a double backslash in a JS
    // string literal; a single "\:" collapses to ":" and the selector is then
    // read as class "desktop" followed by a pseudo-class.
    const footerMainLinks = $('.usa-footer__primary-section .desktop\\:grid-col-6 .hds-footer-menu li a').map((i, el) => ({
      text: $(el).text().trim(),
      url: $(el).attr('href')
    })).get();

    const socialLinks = $('.hds-footer-socials a').map((i, el) => {
      // Expected label shape: "<something> on <platform> (...)". Guard against
      // a missing or differently worded label instead of throwing a TypeError.
      const label = $(el).attr('aria-label') || '';
      const platform = label.includes(' on ')
        ? label.split(' on ')[1].split(' (')[0]
        : null;
      return {
        platform,
        url: $(el).attr('href')
      };
    }).get();

    const utilityLinks = $('.hds-footer-secondary li a').map((i, el) => ({
      text: $(el).text().trim(),
      url: $(el).attr('href')
    })).get();

    return {
      pageTitle,
      introDescription,
      mainSections,
      leadership,
      careersCallout,
      footer: {
        mainLinks: footerMainLinks,
        socialLinks,
        utilityLinks
      }
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * Scrape the Spanish-language NASA landing page.
 *
 * Downloads https://www.nasa.gov/es/ and extracts the intro banner, the
 * tabbed sections, the featured podcast, the latest news items, the
 * "Espacio a Tierra" video embed, two featured-link lists and the topic cards.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   heroSection: object,
 *   tabbedSections: object[],
 *   featuredPodcast: object,
 *   latestNews: object[],
 *   spaceToGroundVideo: object,
 *   moreNasaPages: object[],
 *   socialMedia: object[],
 *   discoverMoreTopics: object[]
 * }>} The extracted page data. Missing links and images are ''.
 * @throws {Error} "Scraping failed: <reason>" when the request or parsing
 *   fails; the failure is also logged to stderr and the original error is
 *   attached as `cause`.
 */
async function scrapeNasaEspanol() {
  const url = 'https://www.nasa.gov/es/';

  // Resolve a topic-card href against the page URL. A missing href yields ''
  // (like the other link fields here) instead of ".../undefined", and an href
  // the URL parser rejects is returned as found.
  const resolveUrl = (href) => {
    if (!href) {
      return '';
    }
    try {
      return new URL(href, url).href;
    } catch {
      return href;
    }
  };

  try {
    const { data: html } = await axios.get(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36' }
    });
    const $ = cheerio.load(html);

    const heroSection = {
      title: $('div.hds-page-intro-banner h1').text().trim(),
      description: $('div.hds-page-intro-banner p.p-lg').text().trim(),
      backgroundImage: $('div.hds-page-intro-banner figure.hds-media-background img').attr('src') || '',
    };

    const tabbedSections = $('.hds-tabbed-section-tab').map((i, el) => {
      const element = $(el);
      return {
        title: element.find('h2.heading-29').text().trim(),
        subtitle: element.find('p.heading-16').text().trim(),
        description: element.find('p.p-md').text().trim(),
        link: element.find('a.button-primary').attr('href') || '',
        image: element.find('.hds-tabbed-section-image img').attr('src') || ''
      };
    }).get();

    const featuredPodcast = {
      seriesTitle: $('.hds-featured-podcasts h2.heading-41').text().trim(),
      seriesDescription: $('.hds-featured-podcasts p.p-sm').first().text().trim(),
      seriesLink: $('.hds-featured-podcasts a.button-primary').attr('href') || '',
      latestEpisode: {
        title: $('.hds-featured-podcasts h3.heading-18').text().trim(),
        date: $('.hds-featured-podcasts .heading-12.text-uppercase').text().trim(),
        audioUrl: $('.hds-featured-podcasts audio source').attr('src') || '',
        detailsLink: $('.hds-featured-podcasts a.hds-link').attr('href') || ''
      }
    };

    const latestNews = $('.hds-content-lists-inner .hds-content-item').map((i, el) => {
      const element = $(el);
      return {
        title: element.find('.hds-a11y-heading-22').text().trim(),
        link: element.find('a.hds-content-item-heading').attr('href') || '',
        image: element.find('.hds-content-item-thumbnail img').attr('src') || '',
        excerpt: element.find('p').first().text().trim(),
        readTime: element.find('.hds-content-item-readtime').text().trim() || '',
        // Items without a type label are reported as 'Article'.
        contentType: element.find('.label > span').text().trim() || 'Article'
      };
    }).get();

    const spaceToGroundVideo = {
      title: $('h2.wp-block-heading:contains("Espacio a Tierra")').text().trim(),
      embedUrl: $('figure.wp-block-embed-youtube iframe').attr('src') || '',
      caption: $('figure.wp-block-embed-youtube figcaption').text().trim()
    };

    // The first featured-link list is read as "more pages", the last one as
    // the social-media list.
    const moreNasaPages = $('.hds-featured-link-list').first().find('.featured-link-list-row').map((i, el) => {
      const element = $(el);
      return {
        title: element.find('h2.heading-36').text().trim(),
        description: element.find('p.p-md').text().trim(),
        link: element.find('a').attr('href') || '',
        image: element.find('img').attr('src') || ''
      };
    }).get();

    const socialMedia = $('.hds-featured-link-list').last().find('.featured-link-list-row').map((i, el) => {
      const element = $(el);
      return {
        platform: element.find('h2.heading-36').text().trim(),
        handle: element.find('p.p-md').text().trim(),
        link: element.find('a').attr('href') || ''
      };
    }).get();

    const discoverMoreTopics = $('.hds-topic-cards a.topic-card').map((i, el) => {
      const element = $(el);
      return {
        title: element.find('.hds-topic-card-heading span').text().trim(),
        link: resolveUrl(element.attr('href')),
        image: element.find('figure.hds-media-background img').attr('src') || ''
      };
    }).get();

    return {
      pageTitle: $('title').text().trim(),
      heroSection,
      tabbedSections,
      featuredPodcast,
      latestNews,
      spaceToGroundVideo,
      moreNasaPages,
      socialMedia,
      discoverMoreTopics
    };
  } catch (error) {
    console.error(`Error scraping ${url}:`, error);
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * Scrape the NASA "Social Media" directory page.
 *
 * Downloads https://www.nasa.gov/social-media/ and extracts the flagship and
 * Spanish account lists plus the grouped sections (centers, directorates,
 * missions, leadership, astronauts, astronaut candidates). Each grouped
 * section is a run of sibling elements in which a paragraph names an entry
 * and the following list holds that entry's accounts.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   hero: { title: string, description: string },
 *   mainAccounts: { platform: string, url: string }[],
 *   spanishAccounts: { platform: string, url: string }[],
 *   centers: object[],
 *   directorates: object[],
 *   missions: object[],
 *   leadership: object[],
 *   astronauts: object[],
 *   astronautCandidates: object[]
 * }>} The extracted page data; grouped entries are
 *   `{ name, accounts: [{ platform, url }] }` and only entries with at least
 *   one account are reported.
 * @throws {Error} "Scraping failed: <reason>" when the request or parsing fails;
 *   the original error is attached as `cause`.
 */
async function scrapeNasaSocialMedia() {
  try {
    const { data: html } = await axios.get('https://www.nasa.gov/social-media/', {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
      }
    });
    const $ = cheerio.load(html);
    const content = $('.entry-content');

    // Turn a <ul> into [{ platform, url }]. The platform is the text before
    // the first ':' of the list item; items without a platform or a link are
    // skipped.
    const parseSocialList = (ulElement) => {
      const accounts = [];
      ulElement.find('li').each((_, li) => {
        const link = $(li).find('a');
        const text = $(li).text();
        const platform = text.split(':')[0].trim();
        const url = link.attr('href');
        if (platform && url) {
          accounts.push({ platform, url });
        }
      });
      return accounts;
    };

    // Walk the siblings between two markers. A non-empty <p> starts a new
    // entry (for astronauts the name is the paragraph's link text), a <ul>
    // supplies the accounts of the entry currently being built.
    const parseGroupedSection = (startSelector, endSelector, isAstronaut = false) => {
      const items = [];
      let currentItem = null;
      const flush = () => {
        if (currentItem && currentItem.accounts.length > 0) {
          items.push(currentItem);
        }
      };
      $(startSelector).nextUntil(endSelector).each((_, element) => {
        const el = $(element);
        if (el.is('p') && el.text().trim() !== '') {
          flush();
          const name = isAstronaut ? el.find('a').text().trim() : el.text().trim();
          // A paragraph without a usable name must not leave the previous
          // entry active: a later <ul> would otherwise overwrite the accounts
          // of an entry that was already emitted and emit it a second time.
          currentItem = name ? { name, accounts: [] } : null;
        } else if (el.is('ul') && currentItem) {
          currentItem.accounts = parseSocialList(el);
        }
      });
      flush();
      return items;
    };

    const mainAccountsList = content.find('p:contains("These accounts offer the broadest")').first().next('ul');
    const mainAccounts = parseSocialList(mainAccountsList);
    const spanishAccountsList = content.find('p:contains("Cuentas oficiales de la NASA en español")').first().next('ul');
    const spanishAccounts = parseSocialList(spanishAccountsList);

    const centers = parseGroupedSection('#centers', '#directorates');
    const directorates = parseGroupedSection('#directorates', '#missions-topics');
    const missions = parseGroupedSection('#missions-topics', '#leadership');
    const leadership = parseGroupedSection('#leadership', '#astronauts');
    // The candidates list has no id of its own; it is delimited by its
    // introductory paragraph and the topic cards that follow.
    const astronautHeadingSelector = 'p:contains("NASA Astronaut Candidates")';
    const astronauts = parseGroupedSection('#astronauts', astronautHeadingSelector, true);
    const astronautCandidates = parseGroupedSection(astronautHeadingSelector, '.hds-topic-cards', true);

    return {
      pageTitle: $('title').text().trim(),
      hero: {
        title: $('.hds-topic-hero h1').text().trim(),
        description: $('.hds-topic-hero p').text().trim()
      },
      mainAccounts,
      spanishAccounts,
      centers,
      directorates,
      missions,
      leadership,
      astronauts,
      astronautCandidates,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * Read the newsletter catalogue from the NASA "Newsletters" page.
 *
 * Every `h3.wp-block-heading` carrying an id is treated as a category whose
 * newsletters are the rows of the table figure directly following it. Rows
 * with fewer than two cells, or without a title or description, are ignored,
 * and categories that end up empty are left out.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   pageHeadline: string,
 *   categories: { categoryTitle: string, newsletters: { title: string, signupLink: string, description: string }[] }[]
 * }>} The catalogue; `signupLink` is '' when the title cell has no link.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaNewsletters() {
  const baseUrl = 'https://www.nasa.gov';
  const targetUrl = `${baseUrl}/nasa-newsletters/`;
  try {
    const response = await axios.get(targetUrl, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' }
    });
    const $ = cheerio.load(response.data);

    // Convert one table row into a newsletter record, or null when unusable.
    const readRow = (rowNode) => {
      const cells = $(rowNode).find('td');
      if (cells.length < 2) {
        return null;
      }
      const nameCell = cells.eq(0);
      // Only the first child node of the cell is used as the title.
      const title = nameCell.contents().first().text().trim();
      const description = cells.eq(1).text().trim();
      if (!title || !description) {
        return null;
      }
      const href = nameCell.find('a').attr('href') || '';
      const needsResolving = href && !href.startsWith('http');
      const signupLink = needsResolving ? new URL(href, baseUrl).href : href;
      return { title, signupLink, description };
    };

    const categories = [];
    for (const headingNode of $('h3.wp-block-heading[id]').toArray()) {
      const heading = $(headingNode);
      const rows = heading.next('figure.wp-block-table').find('table').find('tbody tr').toArray();
      const newsletters = rows.map(readRow).filter(Boolean);
      if (newsletters.length === 0) {
        continue;
      }
      categories.push({
        categoryTitle: heading.text().trim(),
        newsletters
      });
    }

    return {
      pageTitle: $('title').text().trim(),
      pageHeadline: $('h1.heading-41').text().trim(),
      categories
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Gather the sections of the NASA "Get Involved" page.
 *
 * The page is fetched from https://www.nasa.gov/get-involved/ and split into
 * the hero, challenges, research tabs, student opportunities, virtual events,
 * "stay connected" cards and the topic cards. Links and images that do not
 * already start with "http" are resolved against https://www.nasa.gov; a
 * missing reference is passed through unchanged (undefined).
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   hero: object,
 *   sections: {
 *     challenges: object[],
 *     research: object[],
 *     studentOpportunities: object[],
 *     virtualEvents: object[],
 *     stayConnected: object[],
 *     discoverMore: object[]
 *   }
 * }>} The extracted sections.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaGetInvolved() {
  const baseUrl = 'https://www.nasa.gov';
  const targetUrl = `${baseUrl}/get-involved/`;
  try {
    const response = await axios.get(targetUrl, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' }
    });
    const $ = cheerio.load(response.data);

    // Empty values and values already starting with "http" pass through as-is.
    const getAbsoluteUrl = (path) => {
      const needsBase = Boolean(path) && !path.startsWith('http');
      return needsBase ? new URL(path, baseUrl).href : path;
    };

    // Shared shape of a tab panel paired with the button that opens it.
    const describeTab = (panelNode, tabButton) => {
      const panel = $(panelNode);
      return {
        tabName: tabButton.text().trim(),
        title: panel.find('h2.heading-29').text().trim(),
        description: panel.find('p.p-md').text().trim(),
        link: getAbsoluteUrl(panel.find('a.button-primary').attr('href')),
        imageUrl: getAbsoluteUrl(panel.find('img').attr('src')),
      };
    };

    const hero = {
      title: $('.hds-topic-hero h1').text().trim(),
      imageUrl: getAbsoluteUrl($('.hds-topic-hero img').attr('src')),
      intro: $('.hds-page-intro p.p-lg').text().trim()
    };

    const challenges = $('#NASA-Missions .hds-content-item').toArray().map((node) => {
      const item = $(node);
      return {
        title: item.find('.hds-a11y-heading-22').text().trim(),
        link: getAbsoluteUrl(item.find('a').attr('href')),
        description: item.find('p').text().trim(),
        imageUrl: getAbsoluteUrl(item.find('img').attr('src'))
      };
    });

    // The tab button is looked up by the panel's position in the section.
    const researchTabs = $('#NASA-Research .hds-tabbed-section-tab').toArray().map((node, i) => {
      const button = $(`#NASA-Research button#tab${i}-citizen-science, #NASA-Research button[aria-controls="panel${i}-volunteer-for-a-nasa-study"], #NASA-Research button[aria-controls*="panel${i}"]`).first();
      return describeTab(node, button);
    });

    const studentOpportunities = $('#Student2 .hds-card-custom').toArray().map((node) => {
      const card = $(node);
      return {
        title: card.find('h3.heading-18').text().trim(),
        description: card.find('p').text().trim(),
        link: getAbsoluteUrl(card.find('a.button-primary').attr('href')),
        imageUrl: getAbsoluteUrl(card.find('img').attr('src'))
      };
    });

    const virtualEvents = $('#Events .hds-tabbed-section-tab').toArray().map((node, i) => {
      const button = $(`#Events button[aria-controls*="panel${i}"]`).first();
      return describeTab(node, button);
    });

    const stayConnected = $('#Stay-Connected .hds-meet-the-card').toArray().map((node) => {
      const card = $(node);
      return {
        title: card.find('h3').text().trim(),
        description: card.find('p').text().trim(),
        link: getAbsoluteUrl(card.find('a').first().attr('href')),
        imageUrl: getAbsoluteUrl(card.find('img').attr('src'))
      };
    });

    const discoverMore = $('.hds-topic-cards .topic-card').toArray().map((node) => {
      const card = $(node);
      return {
        title: card.find('.hds-topic-card-heading span').text().trim(),
        link: getAbsoluteUrl(card.attr('href')),
        imageUrl: getAbsoluteUrl(card.find('img').attr('src'))
      };
    });

    return {
      pageTitle: $('title').text().trim(),
      hero,
      sections: {
        challenges,
        research: researchTabs,
        studentOpportunities,
        virtualEvents,
        stayConnected,
        discoverMore
      }
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA HTML sitemap page.
 *
 * Downloads https://www.nasa.gov/sitemap/ and extracts the per-category link
 * menus and the yearly content-archive links.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   sitemapSections: { category: string, categoryUrl: (string|undefined), links: { text: string, url: (string|undefined) }[] }[],
 *   contentArchive: { year: string, url: (string|undefined) }[]
 * }>} The extracted sitemap. URLs are absolute; a link without an href keeps
 *   `undefined` as its url.
 * @throws {Error} "Scraping failed: <reason>" when the request or parsing fails;
 *   the original error is attached as `cause`.
 */
async function scrapeNasaSitemap() {
  const baseUrl = 'https://www.nasa.gov';
  const targetUrl = `${baseUrl}/sitemap/`;

  // Make an href absolute. Values already starting with "http" and missing
  // values are returned unchanged. Everything else is resolved with the URL
  // parser relative to the sitemap page, so protocol-relative ("//host/x")
  // and document-relative hrefs are handled correctly, which plain string
  // concatenation with the origin cannot do. Note that the parser also
  // normalises the result (for example it percent-encodes spaces). An href
  // the parser rejects is returned as found.
  const resolveUrl = (path) => {
    if (!path || path.startsWith('http')) {
      return path;
    }
    try {
      return new URL(path, targetUrl).href;
    } catch {
      return path;
    }
  };

  try {
    const { data: html } = await axios.get(targetUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
      }
    });
    const $ = cheerio.load(html);

    const sitemapSections = $('.hds-sitemap-term-menu-blocks-wrapper .hds-sitemap-term-menu').map((_, el) => {
      const categoryElement = $(el).find('.hds-sitemap-menu-top-link a');
      const category = categoryElement.text().trim();
      const categoryUrl = resolveUrl(categoryElement.attr('href'));
      // Every link of the menu except the category's own top link.
      const links = $(el).find('li:not(.hds-sitemap-menu-top-link) a').map((_, linkEl) => ({
        text: $(linkEl).text().trim(),
        url: resolveUrl($(linkEl).attr('href'))
      })).get();
      return { category, categoryUrl, links };
    }).get();

    const contentArchive = $('.hds-sitemap-yearly-menu-blocks-wrapper .hds-sitemap-yearly-menu .hds-sitemap-menu-link a').map((_, el) => ({
      year: $(el).text().trim(),
      url: resolveUrl($(el).attr('href'))
    })).get();

    return {
      pageTitle: $('h1.display-48').text().trim(),
      sitemapSections,
      contentArchive,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * List the mission entries shown on the NASA "Missions" page.
 *
 * Each `div.hds-search-result.mission-terms-result-container` on
 * https://www.nasa.gov/missions/ becomes one record. The link is taken from
 * the anchor that directly wraps the title heading.
 *
 * @returns {Promise<{
 *   source: string,
 *   pageTitle: string,
 *   missions: { title: string, link: (string|undefined), imageUrl: (string|undefined), description: string }[]
 * }>} The mission list.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaMoreMissions() {
  try {
    const response = await axios.get('https://www.nasa.gov/missions/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36' }
    });
    const $ = cheerio.load(response.data);

    const containers = $('div.hds-search-result.mission-terms-result-container').toArray();
    const missions = containers.map((node) => {
      const container = $(node);
      const heading = container.find('h4.mission-terms-result-title');
      return {
        title: heading.text().trim(),
        link: heading.parent('a').attr('href'),
        imageUrl: container.find('.mission-terms-result-image img').attr('src'),
        description: container.find('.mission-terms-result-excerpt').text().trim()
      };
    });

    return {
      source: 'NASA Missions',
      pageTitle: $('title').text().trim(),
      missions
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Extract the contact tables of the NASA "Media Contacts" page.
 *
 * Most sections are an `h1` with a known id directly followed by a table
 * figure; the leadership section instead collects every table figure between
 * `h1#interviews` and the next `h1`. `email` fields hold the raw href of the
 * contact link (null when there is no link).
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   mainContact: { email: (string|undefined), phone: (string|null) },
 *   mediaContacts: object[],
 *   leadershipRequests: object[],
 *   photography: object[],
 *   nonMediaInquiries: object[],
 *   centerNewsrooms: object[],
 *   resources: object[]
 * }>} The extracted contacts.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaMediaContacts() {
  try {
    const response = await axios.get('https://www.nasa.gov/news/nasa-media-contacts/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' }
    });
    const $ = cheerio.load(response.data);

    // Cell collections (one per body row) of the given table figures.
    const cellsPerRow = (figures) =>
      figures.find('tbody tr').toArray().map((row) => $(row).find('td'));
    // Rows of the table figure that immediately follows a heading.
    const rowsAfter = (headingSelector) =>
      cellsPerRow($(headingSelector).next('figure.wp-block-table'));
    const cellText = (cells, index) => cells.eq(index).text().trim();

    // First NNN-NNN-NNNN pattern in the intro paragraph, if any.
    const introText = $('.hds-page-intro p.p-lg').text();
    const phoneMatch = introText.match(/\d{3}-\d{3}-\d{4}/);
    const mainPhoneNumber = phoneMatch ? phoneMatch[0] : null;

    const mediaContacts = rowsAfter('h1#media-contacts').map((cells) => {
      const anchor = cells.eq(0).find('a');
      return {
        // Fall back to the plain cell text when the name is not a link.
        name: anchor.text().trim() || cellText(cells, 0),
        email: anchor.attr('href') || null,
        responsibility: cellText(cells, 1),
        phone: cellText(cells, 2) || null,
      };
    });

    const leadershipTables = $('h1#interviews').nextUntil('h1', 'figure.wp-block-table').toArray();
    const leadershipRequests = leadershipTables.flatMap((figure) =>
      cellsPerRow($(figure)).map((cells) => {
        const anchor = cells.eq(0).find('a');
        return {
          name: anchor.text().trim(),
          email: anchor.attr('href') || null,
          title: cellText(cells, 1),
          phone: cellText(cells, 2) || null,
        };
      })
    );

    const photography = rowsAfter('h1#photos-and-video').map((cells) => ({
      office: cellText(cells, 0),
      phone: cellText(cells, 2),
    }));

    const nonMediaInquiries = rowsAfter('h1#non-media').map((cells) => {
      const anchor = cells.eq(0).find('a');
      return {
        department: anchor.text().trim(),
        email: anchor.attr('href') || null,
        purpose: cellText(cells, 1),
      };
    });

    const centerNewsrooms = rowsAfter('h1#center-newsrooms').map((cells) => ({
      center: cellText(cells, 0),
      phone: cellText(cells, 1),
    }));

    const resources = $('#resources .featured-link-list-row').toArray().map((node) => {
      const row = $(node);
      return {
        title: row.find('h2.heading-36').text().trim(),
        link: row.find('.grid-col-2 a').attr('href')
      };
    });

    return {
      pageTitle: $('h1.heading-41').text().trim(),
      mainContact: {
        email: $('.hds-page-intro p.p-lg a').attr('href'),
        phone: mainPhoneNumber,
      },
      mediaContacts,
      leadershipRequests,
      photography,
      nonMediaInquiries,
      centerNewsrooms,
      resources,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Break the NASA web privacy policy page into structured content.
 *
 * Inside `article.type-topic` the function reads the title, the first large
 * paragraph, the in-page table of contents, one section per
 * `h1.wp-block-heading` (paragraphs and lists up to the next heading or
 * spacer) and the officials listed after the "NASA Officials for Privacy
 * Related Matters" paragraph.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   introduction: string,
 *   tableOfContents: { text: string, anchor: string }[],
 *   sections: { title: string, anchor: string, content: object[] }[],
 *   officials: { title: string, details: string[] }[]
 * }>} The structured policy. Sections without a title or without content, and
 *   officials without a title or details, are omitted.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaPrivacy() {
  try {
    const response = await axios.get('https://www.nasa.gov/nasa-web-privacy-policy-and-important-notices/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' }
    });
    const $ = cheerio.load(response.data);
    const contentArea = $('article.type-topic');

    const tableOfContents = contentArea.find('.hds-page-intro a[href^="#"]').toArray().map((node) => {
      const anchorLink = $(node);
      return {
        text: anchorLink.find('span').text().trim(),
        anchor: anchorLink.attr('href'),
      };
    });

    // Translate one sibling element into a content entry, or null to skip it.
    const toContentEntry = (node) => {
      const block = $(node);
      if (block.is('p')) {
        const text = block.text().trim();
        return text ? { type: 'paragraph', text } : null;
      }
      if (block.is('ul, ol')) {
        const items = block.find('li').toArray().map((li) => $(li).text().trim().replace(/\s+/g, ' '));
        return items.length > 0 ? { type: 'list', items } : null;
      }
      return null;
    };

    const sections = [];
    for (const headingNode of contentArea.find('h1.wp-block-heading').toArray()) {
      const heading = $(headingNode);
      const title = heading.text().trim();
      const anchor = heading.attr('id') || '';
      const content = heading
        .nextUntil('h1.wp-block-heading, .wp-block-spacer')
        .toArray()
        .map(toContentEntry)
        .filter((entry) => entry !== null);
      if (title && content.length > 0) {
        sections.push({ title, anchor, content });
      }
    }

    const officials = [];
    const officialsHeader = contentArea.find('p:contains("NASA Officials for Privacy Related Matters")');
    for (const paragraphNode of officialsHeader.nextAll('p').toArray()) {
      const paragraph = $(paragraphNode);
      const title = paragraph.find('strong').text().trim();
      // Drop the bold title, then treat every <br>-separated piece as one
      // detail line, reduced to its text content.
      const withoutTitle = paragraph.html().replace(/<strong>.*?<\/strong>/g, '');
      const details = [];
      for (const fragment of withoutTitle.split('<br>')) {
        const line = cheerio.load(fragment).text().trim();
        if (line) {
          details.push(line);
        }
      }
      if (title && details.length > 0) {
        officials.push({ title, details });
      }
    }

    return {
      pageTitle: contentArea.find('h1.heading-41').text().trim(),
      introduction: contentArea.find('p.p-lg').first().text().trim(),
      tableOfContents,
      sections,
      officials
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Extract the content of the NASA FOIA landing page.
 *
 * Reads the heading, intro and banner of https://www.nasa.gov/foia/ plus the
 * officer cards, the informational tabs (paired with the tab buttons by
 * position), the resource cards, the featured-link rows and the
 * "submit a request" block.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   introductoryText: string,
 *   bannerImageUrl: (string|null),
 *   foiaOfficers: object[],
 *   informationalTabs: object[],
 *   foiaResources: object[],
 *   additionalInformation: object[],
 *   submitRequest: object
 * }>} The extracted page data; most missing links and images are null.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaFoia() {
  try {
    const response = await axios.get('https://www.nasa.gov/foia/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }
    });
    const $ = cheerio.load(response.data);

    const foiaOfficers = $('.hds-meet-the-card').toArray().map((node) => {
      const card = $(node);
      return {
        name: card.find('h3.heading-18').text().trim(),
        title: card.find('p.p-sm').text().trim(),
        profileUrl: card.find('a').first().attr('href') || null,
      };
    });

    // Button labels, matched to the tab panels below by index.
    const tabLabels = $('ul.hds-tab-nav li button').toArray().map((button) => $(button).text().trim());
    const informationalTabs = $('.hds-tabbed-section-tab').toArray().map((node, position) => {
      const panel = $(node);
      return {
        tabTitle: tabLabels[position] || null,
        title: panel.find('h2.heading-29').text().trim(),
        description: panel.find('p.p-md').text().trim(),
        link: panel.find('a.button-primary').attr('href') || null,
        imageUrl: panel.find('figure.hds-media-inner img').attr('src') || null
      };
    });

    const foiaResources = $('.hds-card-grid a.hds-card-topic').toArray().map((node) => {
      const card = $(node);
      return {
        title: card.find('h3.hds-topic-card-heading span').text().trim(),
        link: card.attr('href'),
        imageUrl: card.find('figure.hds-media-background img').attr('src') || null
      };
    });

    const additionalInformation = $('.hds-featured-link-list .featured-link-list-row').toArray().map((node) => {
      const row = $(node);
      return {
        title: row.find('h2.heading-36').text().trim(),
        description: row.find('p.p-md').text().trim(),
        link: row.find('a').attr('href') || null,
        imageUrl: row.find('figure.hds-media-background img').attr('src') || null
      };
    });

    const requestBlock = $('.wp-block-nasa-blocks-featured-link');

    return {
      pageTitle: $('h1.page-heading-md').text().trim(),
      introductoryText: $('.hds-page-intro p.p-lg').text().trim(),
      bannerImageUrl: $('.hds-page-intro-banner figure.hds-media-background img').attr('src') || null,
      foiaOfficers,
      informationalTabs,
      foiaResources,
      additionalInformation,
      submitRequest: {
        title: requestBlock.find('h2.page-heading-md').text().trim(),
        description: requestBlock.find('p.p-md').text().trim(),
        link: requestBlock.find('a.button-primary').attr('href') || null
      }
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Extract the report links from the NASA "No FEAR Act" page.
 *
 * Within `article.type-topic` the function reads the title, the paragraph
 * following the first `h3.wp-block-heading`, the quarterly report list, the
 * annual report link and the reference documents; the related topic cards are
 * read from the whole document.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   description: string,
 *   quarterlyReports: { title: string, url: (string|undefined) }[],
 *   annualReport: { title: string, url: (string|undefined) },
 *   referenceDocuments: { title: string, url: (string|undefined) }[],
 *   relatedTopics: { title: string, url: (string|undefined) }[]
 * }>} The extracted page data.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaNoFearAct() {
  const url = 'https://www.nasa.gov/no-fear-act/';
  try {
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
      }
    });
    const $ = cheerio.load(response.data);
    const mainContent = $('article.type-topic');

    // Turn a selection of anchors into { title, url } records.
    const describeLinks = (anchors) => anchors.toArray().map((node) => {
      const anchor = $(node);
      return {
        title: anchor.text().trim(),
        url: anchor.attr('href')
      };
    });

    const annualReportBlock = mainContent.find('.hds-featured-link-list');
    const relatedTopics = $('div.hds-topic-cards a.topic-card').toArray().map((node) => {
      const card = $(node);
      return {
        title: card.find('p.hds-topic-card-heading span').text().trim(),
        url: card.attr('href')
      };
    });

    return {
      pageTitle: mainContent.find('h1.page-heading-md').text().trim(),
      description: mainContent.find('h3.wp-block-heading').first().next('p').text().trim(),
      quarterlyReports: describeLinks(mainContent.find('h1.wp-block-heading:contains("Quarter Reports") + ul li a')),
      annualReport: {
        title: annualReportBlock.find('h2.heading-36').text().trim(),
        url: annualReportBlock.find('a').attr('href')
      },
      referenceDocuments: describeLinks(mainContent.find('p:contains("Reference Documents:") + p a')),
      relatedTopics
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA Office of Inspector General home page.
 *
 * Downloads the page at oig.nasa.gov and extracts the hero slides, the
 * latest-news links, the multimedia content items, the callout blocks and the
 * topic cards.
 *
 * @returns {Promise<{
 *   sourceUrl: string,
 *   heroSlides: object[],
 *   latestNews: object[],
 *   multimedia: object[],
 *   callouts: object[],
 *   topicCards: object[]
 * }>} The extracted page data; missing links and images are null.
 * @throws {Error} "Scraping failed: <reason>" when the request or parsing fails;
 *   the original error is attached as `cause`.
 */
async function scrapeNasaOig() {
  // Request the page over HTTPS so the scraped content is not fetched (and
  // `sourceUrl` is not reported) over an unencrypted connection.
  const url = 'https://oig.nasa.gov/';
  try {
    const { data: html } = await axios.get(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36' }
    });
    const $ = cheerio.load(html);

    const heroSlides = $('.hds-slider .hds-nasa-mag-wrapper').map((_, el) => {
      const slide = $(el);
      return {
        title: slide.find('h2.display-72').text().trim(),
        description: slide.find('p.maxw-tablet').text().trim(),
        link: slide.find('a.usa-button--secondary').attr('href') || null,
        image: slide.find('figure.hds-media-background img').attr('src') || null
      };
    }).get();

    const latestNews = $('.latest-news-items a').map((_, el) => {
      const article = $(el);
      return {
        title: article.find('p.heading-14, p.heading-22').text().trim(),
        link: article.attr('href') || null,
        image: article.find('figure.hds-media-background img').attr('src') || null,
        // Collapse internal whitespace of the first label.
        readTime: article.find('.label').first().text().trim().replace(/\s+/g, ' ')
      };
    }).get();

    const multimedia = $('.hds-content-lists .hds-content-item').map((_, el) => {
      const item = $(el);
      return {
        title: item.find('.hds-a11y-heading-22').text().trim(),
        // Prefer the heading link; otherwise use the first link of the item.
        link: item.find('a.hds-content-item-heading').attr('href') || item.find('a').first().attr('href') || null,
        thumbnail: item.find('figure.hds-media-background img').attr('src') || null
      };
    }).get();

    const topicCards = $('.hds-topic-cards-wrapper a.topic-card').map((_, el) => {
      const card = $(el);
      return {
        title: card.find('p.hds-topic-card-heading span').text().trim(),
        link: card.attr('href') || null,
        image: card.find('figure.hds-media-background img').attr('src') || null
      };
    }).get();

    const callouts = $('.wp-block-nasa-blocks-callout').map((_, el) => {
      const callout = $(el);
      return {
        title: callout.find('h2.page-heading-md').text().trim(),
        description: callout.find('p.p-lg').text().trim(),
        link: callout.find('a.button-primary').attr('href') || null,
        // A background video takes precedence over a background image.
        backgroundMedia: callout.find('video source').attr('src') || callout.find('figure img').attr('src') || null
      };
    }).get();

    return {
      sourceUrl: url,
      heroSlides,
      latestNews,
      multimedia,
      callouts,
      topicCards
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`, { cause: error });
  }
}
/**
 * Extract the content of the NASA "Budgets, Plans and Reports" page.
 *
 * Reads the heading and banner image of
 * https://www.nasa.gov/budgets-plans-and-reports/ together with the
 * featured-link rows, the latest news items, the callout block and the topic
 * cards.
 *
 * @returns {Promise<{
 *   pageTitle: string,
 *   bannerImage: (string|undefined),
 *   featuredLinks: object[],
 *   latestNews: object[],
 *   economicBenefitsCallout: object,
 *   discoverMoreTopics: object[]
 * }>} The extracted page data; missing links and images stay undefined.
 * @throws {Error} "Scraping failed: <reason>" on any request or parsing error.
 */
async function scrapeNasaBudgets() {
  try {
    const response = await axios.get('https://www.nasa.gov/budgets-plans-and-reports/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' }
    });
    const $ = cheerio.load(response.data);

    const featuredLinks = $('.hds-featured-link-list .featured-link-list-row').toArray().map((node) => {
      const row = $(node);
      return {
        title: row.find('h2.heading-36').text().trim(),
        link: row.find('a').attr('href')
      };
    });

    const latestNews = $('.latest-news-items .latest-news-item a').toArray().map((node) => {
      const anchor = $(node);
      return {
        title: anchor.find('p.heading-22 span').text().trim(),
        link: anchor.attr('href'),
        image: anchor.find('figure.hds-media-background img').attr('src'),
        // The category is the text next to the icon in the first inner div.
        category: anchor.find('div > div:first-child > svg + span').text().trim(),
        readTime: anchor.find('.display-block .label').text().trim()
      };
    });

    const calloutBlock = '.wp-block-nasa-blocks-callout';
    const economicBenefitsCallout = {
      title: $(`${calloutBlock} h2`).text().trim(),
      description: $(`${calloutBlock} p`).text().trim(),
      link: $(`${calloutBlock} a`).attr('href'),
      backgroundImage: $(`${calloutBlock} figure img`).attr('src')
    };

    const discoverMoreTopics = $('.hds-topic-cards a.topic-card').toArray().map((node) => {
      const card = $(node);
      return {
        title: card.find('p.hds-topic-card-heading span').text().trim(),
        link: card.attr('href'),
        image: card.find('figure.hds-media-background img').attr('src')
      };
    });

    return {
      pageTitle: $('h1.page-heading-md').text().trim(),
      bannerImage: $('.hds-page-intro-banner figure.hds-media-background img').attr('src'),
      featuredLinks,
      latestNews,
      economicBenefitsCallout,
      discoverMoreTopics
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the NASA agency financial reports file list.
 *
 * @returns {Promise<{sourceUrl: string, pageTitle: string, reports: object[]}>}
 *   `reports` holds one `{ title, publishedDate, fileInfo, downloadUrl }`
 *   object per file-list row; `downloadUrl` is null when the row has no
 *   download anchor href.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaFinancialReports() {
  const url = 'https://www.nasa.gov/organizations/budget-annual-reports/agency-financial-reports/';
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  try {
    const response = await axios.get(url, { headers: { 'User-Agent': userAgent } });
    const $ = cheerio.load(response.data);

    const reports = $('.hds-featured-file-list .hds-file-list-row')
      .toArray()
      .map((rowNode) => {
        const row = $(rowNode);
        const href = row.find('.hds-file-list-download a').attr('href');
        return {
          title: row.find('.hds-list-name h2').text().trim(),
          publishedDate: row.find('.hds-list-date p').text().trim(),
          fileInfo: row.find('.hds-file-list-filetype p').text().trim(),
          downloadUrl: href ? href : null
        };
      });

    return {
      sourceUrl: url,
      pageTitle: $('.hds-featured-file-list .heading-22').text().trim(),
      reports
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/contact-nasa/.
 *
 * @returns {Promise<object>} Object with:
 *   - `pageTitle` / `pageDescription`: text of the page intro heading and lead paragraph.
 *   - `contactAddress`: array of non-empty address lines taken from the
 *     paragraph that follows the "you may write to us at:" heading; an empty
 *     array when that paragraph is not present.
 *   - `linkSections`: `{ sectionTitle, links }` for every featured-link list
 *     that contains at least one row.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaContact() {
  try {
    const { data: html } = await axios.get('https://www.nasa.gov/contact-nasa/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36' }
    });
    const $ = cheerio.load(html);
    const pageTitle = $('div.hds-page-intro h1').text().trim();
    const pageDescription = $('div.hds-page-intro p.p-lg').text().trim();

    const addressParagraph = $('h3:contains("you may write to us at:")').next('p');
    // .html() yields null when nothing matched; fall back to an empty string so
    // a missing address block produces [] instead of failing the whole scrape.
    const addressHtml = addressParagraph.html() || '';
    const contactAddress = addressHtml
      .split(/<br\s*\/?>/i) // tolerate <br>, <br/> and <br /> serialisations
      .map((line) => line.trim())
      .filter((line) => line);

    const linkSections = $('div.hds-featured-link-list').map((_, section) => {
      const sectionTitle = $(section).find('h2.heading-22').text().trim();
      const links = $(section).find('.featured-link-list-row').map((__, link) => ({
        title: $(link).find('h2.heading-36').text().trim(),
        description: $(link).find('p.p-md').text().trim() || null,
        url: $(link).find('a').attr('href') || null,
        imageUrl: $(link).find('img').attr('src') || null,
      })).get();
      // Sections without any link rows are dropped from the result.
      return links.length > 0 ? { sectionTitle, links } : null;
    }).get().filter(Boolean);

    return {
      pageTitle,
      pageDescription,
      contactAddress,
      linkSections
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/accessibility/.
 *
 * Collects the page heading, the article meta heading, the description and
 * og:image meta tags, selected footer meta values, the table-of-contents links
 * and, for every `h2` inside `.entry-content`, the paragraphs, links and list
 * items found between it and the next `h2`.
 *
 * @returns {Promise<object>} Object with `pageTitle`, `author`, `metadata`,
 *   `footerInfo`, `tableOfContents` and `content`.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaAccessibility() {
  // Footer label fragment -> key in `footerInfo`; the first matching fragment wins.
  const footerFields = [
    ['page last updated', 'lastUpdated'],
    ['page editor', 'pageEditor'],
    ['responsible nasa official', 'responsibleOfficial']
  ];
  try {
    const response = await axios.get('https://www.nasa.gov/accessibility/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' }
    });
    const $ = cheerio.load(response.data);

    const footerInfo = {};
    for (const metaNode of $('footer .hds-footer-meta').toArray()) {
      const meta = $(metaNode);
      const label = meta.find('.hds-footer-meta-label').text().trim().toLowerCase();
      const value = meta.find('.hds-footer-meta-value').text().trim();
      const field = footerFields.find(([fragment]) => label.includes(fragment));
      if (field) {
        footerInfo[field[1]] = value;
      }
    }

    const tableOfContents = $('ul.usa-list__article-content li a')
      .toArray()
      .map((anchor) => ({
        text: $(anchor).text().trim().replace(/\s+/g, ' '),
        anchor: $(anchor).attr('href') || null,
      }));

    const content = $('.entry-content h2')
      .toArray()
      .map((h2) => {
        const heading = $(h2);
        const paragraphs = [];
        const links = [];
        const listItems = [];

        // Walk the siblings that belong to this heading (up to the next h2).
        for (const siblingNode of heading.nextUntil('h2').toArray()) {
          const sibling = $(siblingNode);
          if (sibling.is('p')) {
            paragraphs.push(sibling.text().trim());
            for (const anchorNode of sibling.find('a').toArray()) {
              const anchor = $(anchorNode);
              const href = anchor.attr('href');
              links.push({
                text: anchor.text().trim(),
                url: href ? new URL(href, 'https://www.nasa.gov').href : null
              });
            }
          } else if (sibling.is('ul')) {
            for (const li of sibling.find('li').toArray()) {
              listItems.push($(li).text().trim());
            }
          }
        }

        return {
          heading: heading.text().trim(),
          id: heading.attr('id') || null,
          paragraphs,
          links,
          listItems
        };
      });

    return {
      pageTitle: $('h1.display-48').text().trim(),
      author: $('.article-meta-item h3.hds-meta-heading').text().trim(),
      metadata: {
        description: $('meta[name="description"]').attr('content') || null,
        ogImage: $('meta[property="og:image"]').attr('content') || null
      },
      footerInfo,
      tableOfContents,
      content
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
/**
 * Scrape the blog listing at https://www.nasa.gov/nasa-blogs/.
 *
 * @returns {Promise<{pageTitle: string, blogs: object[]}>} `blogs` holds one
 *   `{ title, link, imageUrl }` per featured-link row; each field is null when
 *   the corresponding value is empty or missing.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaBlogs() {
  // Normalises empty strings and undefined attributes to null.
  const orNull = (value) => (value ? value : null);
  try {
    const response = await axios.get('https://www.nasa.gov/nasa-blogs/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' }
    });
    const $ = cheerio.load(response.data);

    const blogs = $('.hds-featured-link-list .featured-link-list-row')
      .toArray()
      .map((rowNode) => {
        const row = $(rowNode);
        return {
          title: orNull(row.find('h2.heading-36').text().trim()),
          link: orNull(row.find('div.grid-col-2 a').attr('href')),
          imageUrl: orNull(row.find('img').attr('src')),
        };
      });

    return {
      pageTitle: $('h1.display-72').text().trim(),
      blogs
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/nasa-virtual-guest-program/.
 *
 * @returns {Promise<object>} Object with `pageTitle`, `hero` (title, background
 *   image, stats), `promoVideoUrl`, `joinUsSection` (title, description,
 *   events), `faqs`, `passport` and `latestNews`. `latestNews` entries are
 *   de-duplicated by title and their `link` is an absolute URL, or null when
 *   the anchor has no usable href.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaVirtualGuest() {
  const url = 'https://www.nasa.gov/nasa-virtual-guest-program/';

  // Resolve an href against the page URL. A missing href must not be passed to
  // `new URL`, which would stringify it into ".../undefined"; a malformed href
  // yields null so that one bad link does not abort the whole scrape.
  const toAbsoluteUrl = (href) => {
    if (!href) {
      return null;
    }
    try {
      return new URL(href, url).href;
    } catch (urlError) {
      return null;
    }
  };

  try {
    const { data: html } = await axios.get(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' }
    });
    const $ = cheerio.load(html);
    const heroSection = $('.hds-topic-hero');

    const stats = heroSection.find('.hds-topic-hero-stats .grid-row .grid-col.grid-row').map((i, el) => ({
      value: $(el).find('.stat-number').text().trim(),
      label: $(el).find('.stat-value').text().trim(),
    })).get();

    const joinUsEvents = $('.hds-card-grid-cards .hds-card-custom').map((i, el) => ({
      title: $(el).find('h3.heading-18').text().trim(),
      description: $(el).find('p').text().trim(),
      image: $(el).find('img').attr('src'),
      registrationLink: $(el).find('a.button-primary').attr('href'),
    })).get();

    // A FAQ is an h3 after the "FAQs:" heading whose next sibling is a <p>.
    const faqs = [];
    $('h1:contains("FAQs:")').nextAll('h3.wp-block-heading').each((i, el) => {
      const question = $(el).text().trim();
      const answer = $(el).next('p').text().trim();
      if (question && answer) {
        faqs.push({ question, answer });
      }
    });

    // Anchors without a heading paragraph are not news cards and are skipped.
    const latestNewsRaw = $('.wp-block-nasa-blocks-news-automated a')
      .toArray()
      .map((el) => {
        const article = $(el);
        const title = article.find('p[class*="heading"]').text().trim();
        if (!title) {
          return null;
        }
        return {
          title,
          link: toAbsoluteUrl(article.attr('href')),
          image: article.find('img').attr('src'),
          readTime: article.find('.label').first().text().trim(),
        };
      })
      .filter(Boolean);

    // De-duplicate by title (the last card seen for a title wins).
    const latestNews = [...new Map(latestNewsRaw.map((item) => [item.title, item])).values()];

    return {
      pageTitle: $('title').text().trim(),
      hero: {
        title: heroSection.find('h1').text().trim(),
        backgroundImage: heroSection.find('figure img').attr('src'),
        stats: stats
      },
      promoVideoUrl: $('figure.wp-block-embed-youtube iframe').attr('src'),
      joinUsSection: {
        title: $('.hds-card-grid-header h2').text().trim(),
        description: $('.hds-card-grid-header p').text().trim(),
        events: joinUsEvents,
      },
      faqs: faqs,
      passport: {
        title: $('.hds-featured-link-list h2.heading-36').text().trim(),
        description: $('.hds-featured-link-list p.p-md').text().trim(),
        link: $('.hds-featured-link-list a').attr('href'),
        icon: $('.hds-featured-link-list img').attr('src'),
      },
      latestNews
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/podcasts-and-audio/.
 *
 * @returns {Promise<object>} Object with:
 *   - `pageTitle`: text of the document `<title>`.
 *   - `pageDescription`: trimmed content of the description meta tag, or null
 *     when the tag is absent.
 *   - `featuredPodcast`: series title/description/link plus `latestEpisode`
 *     (title, date, link, audioUrl, imageUrl).
 *   - `podcastCategories`: `{ title, description, link, imageUrl }` per
 *     featured-link row.
 *   Links are absolute URLs, or null when the anchor has no href.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaPodcasts() {
  const baseUrl = 'https://www.nasa.gov';
  const targetUrl = `${baseUrl}/podcasts-and-audio/`;

  // `new URL(undefined, base)` would resolve to ".../undefined", so a missing
  // href is reported as null instead.
  const resolveUrl = (href) => (href ? new URL(href, baseUrl).href : null);

  try {
    const { data: html } = await axios.get(targetUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
      }
    });
    const $ = cheerio.load(html);
    const featuredPodcastContainer = $('.hds-featured-podcasts');

    // The class name contains a literal colon ("desktop:grid-col-6"). It must
    // reach the selector engine as "\:" - hence the doubled backslash - or it
    // is parsed as an (unknown) pseudo-class.
    const latestEpisodeContainer = featuredPodcastContainer.find('.grid-col-12.desktop\\:grid-col-6').last();

    const featuredPodcast = {
      seriesTitle: featuredPodcastContainer.find('h2.heading-41').text().trim(),
      seriesDescription: featuredPodcastContainer.find('p.p-sm').text().trim(),
      seriesLink: resolveUrl(featuredPodcastContainer.find('a.button-primary').attr('href')),
      latestEpisode: {
        title: latestEpisodeContainer.find('h3.heading-18').text().trim(),
        date: latestEpisodeContainer.find('.heading-12.text-uppercase').text().trim(),
        link: resolveUrl(latestEpisodeContainer.find('a.hds-link').attr('href')),
        audioUrl: latestEpisodeContainer.find('audio > source').attr('src') || null,
        imageUrl: latestEpisodeContainer.find('div.hds-audio-image img').attr('src') || null
      }
    };

    const podcastCategories = $('.hds-featured-link-list .featured-link-list-row').map((_, el) => {
      const element = $(el);
      return {
        title: element.find('h2.heading-36').text().trim(),
        description: element.find('p.p-md').text().trim(),
        link: resolveUrl(element.find('a').last().attr('href')),
        imageUrl: element.find('img').attr('src') || null
      };
    }).get();

    // The description meta tag may be missing; avoid calling .trim() on undefined.
    const metaDescription = $('head meta[name="description"]').attr('content');

    return {
      pageTitle: $('head title').text().trim(),
      pageDescription: metaDescription ? metaDescription.trim() : null,
      featuredPodcast,
      podcastCategories
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the series grid at https://plus.nasa.gov/series/.
 *
 * @returns {Promise<{source: string, pageUrl: string, series: object[]}>}
 *   `series` holds one `{ title, url, imageUrl, episodeCount }` per
 *   `article.series-grid`. `imageUrl` is taken from the `url(...)` value in the
 *   thumbnail's inline style (null when absent); `episodeCount` is the leading
 *   integer of the episode info text, or 0 when it cannot be parsed.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaPlusSeries() {
  // Matches url(x), url('x') and url("x"); group 2 is x without the quotes.
  // The original pattern kept the quote characters as part of the URL.
  const cssUrlPattern = /url\(\s*(["']?)([^)]+?)\1\s*\)/;

  try {
    const { data: html } = await axios.get('https://plus.nasa.gov/series/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }
    });
    const $ = cheerio.load(html);

    const seriesList = $('article.series-grid').map((i, element) => {
      const seriesCard = $(element);
      const linkElement = seriesCard.find('a.series-grid--link');
      const title = linkElement.find('h4.series-grid--title').text().trim();
      const url = linkElement.attr('href') || null;

      // Episode info starts with the count; parseInt reads the leading digits.
      const episodeInfo = linkElement.find('p.font-family-mono').text().trim();
      const episodeCount = Number.parseInt(episodeInfo.split(' ')[0], 10) || 0;

      const styleAttr = linkElement.find('figure.series-grid--thumbnail').attr('style') || '';
      const imageUrlMatch = styleAttr.match(cssUrlPattern);
      const imageUrl = imageUrlMatch ? imageUrlMatch[2].trim() : null;

      return {
        title,
        url,
        imageUrl,
        episodeCount,
      };
    }).get();

    return {
      source: 'NASA+ Series',
      pageUrl: 'https://plus.nasa.gov/series/',
      series: seriesList,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/news/recently-published/.
 *
 * @returns {Promise<{source: string, url: string, totalArticles: number, articles: object[]}>}
 *   `articles` holds one `{ title, link, imageUrl, readTime, excerpt,
 *   contentType }` per `.hds-content-item`; `readTime` is null when empty.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaRecentlyPublished() {
  const url = 'https://www.nasa.gov/news/recently-published/';
  const requestConfig = {
    headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' }
  };
  try {
    const response = await axios.get(url, requestConfig);
    const $ = cheerio.load(response.data);

    const articles = $('.hds-content-item')
      .toArray()
      .map((node) => {
        const item = $(node);
        const heading = item.find('.hds-content-item-heading');
        return {
          title: heading.text().trim(),
          link: heading.attr('href'),
          imageUrl: item.find('.hds-content-item-thumbnail img').attr('src'),
          readTime: item.find('.hds-content-item-readtime').text().trim() || null,
          excerpt: item.find('.hds-content-item-inner > p').text().trim(),
          contentType: item.find('.hds-content-item-inner > div.label span').text().trim()
        };
      });

    return {
      source: 'NASA Recently Published',
      url,
      totalArticles: articles.length,
      articles
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
/**
 * Scrape the NASA news-release listing for one year
 * (`https://www.nasa.gov/<year>-news-releases/`).
 *
 * Unlike the other scrapers in this file this function never rejects: every
 * failure is reported through the returned object.
 *
 * @param {number|string} [year] Year to fetch; defaults to the current year.
 *   Non-number values are parsed with base-10 `parseInt`. The year must be an
 *   integer between 1990 and the current year.
 * @returns {Promise<object>} On success
 *   `{ success: true, source, pageTitle, newsReleases, year, scrapedAt, totalItems }`;
 *   on failure `{ success: false, error, year, timestamp, newsReleases: [] }`.
 */
async function scrapeNasaNewsReleases(year = new Date().getFullYear()) {
  // Value echoed back in the failure result; replaced by the normalised number
  // once validation has succeeded.
  let requestedYear = year;
  try {
    const numericYear = typeof year === 'number' ? year : Number.parseInt(year, 10);
    // Number.isInteger rejects NaN, Infinity and fractional values. A plain
    // range check lets NaN through because every comparison with NaN is false.
    if (!Number.isInteger(numericYear)) {
      throw new Error('Year must be a valid number');
    }
    requestedYear = numericYear;

    const currentYear = new Date().getFullYear();
    if (numericYear < 1990 || numericYear > currentYear) {
      throw new Error(`Year must be between 1990 and ${currentYear}`);
    }

    const axiosInstance = axios.create({
      timeout: 30000,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    });
    const url = `https://www.nasa.gov/${numericYear}-news-releases/`;

    // Translate transport-level failures into readable messages.
    let response;
    try {
      response = await axiosInstance.get(url);
    } catch (axiosError) {
      if (axiosError.response) {
        if (axiosError.response.status === 404) {
          throw new Error(`NASA page not found for year ${numericYear}`);
        }
        throw new Error(`HTTP Error ${axiosError.response.status}`);
      } else if (axiosError.code === 'ECONNREFUSED') {
        throw new Error('Connection refused to NASA website');
      } else if (axiosError.code === 'ETIMEDOUT') {
        throw new Error('Request timeout to NASA website');
      } else if (axiosError.code === 'ENOTFOUND') {
        throw new Error('NASA website domain not found');
      } else {
        throw new Error(`Network error: ${axiosError.message}`);
      }
    }

    if (!response.data || typeof response.data !== 'string') {
      throw new Error('Invalid response from NASA website');
    }
    if (response.status !== 200) {
      throw new Error(`HTTP Error ${response.status}`);
    }

    let $;
    try {
      $ = cheerio.load(response.data);
    } catch (cheerioError) {
      throw new Error('Failed to parse HTML content');
    }
    if ($('body').length === 0) {
      throw new Error('Invalid HTML structure');
    }

    const pageTitle = $('h2.section-heading-sm').text().trim() || `NASA ${numericYear} News Releases`;
    const newsItems = [];

    $('.hds-content-item').each((index, element) => {
      // A single malformed card is logged and skipped rather than failing the run.
      try {
        const el = $(element);
        const titleElement = el.find('a.hds-content-item-heading');
        const title = titleElement.text().trim();
        if (!title) return;

        const href = titleElement.attr('href');
        // Resolve against the page URL so that root-relative, path-relative
        // and protocol-relative links all become absolute.
        const itemUrl = href ? new URL(href, url).href : null;

        const excerpt = el.find('div.hds-content-item-inner p').text().trim();
        const imageUrl = el.find('a.hds-content-item-thumbnail img').attr('src');
        const readTime = el.find('.hds-content-item-readtime').text().trim().replace(/\s\s+/g, ' ');
        const type = el.find('div.display-flex.flex-align-center.label span').text().trim();

        newsItems.push({
          title,
          url: itemUrl,
          excerpt: excerpt || 'No excerpt available',
          imageUrl: imageUrl || null,
          readTime: readTime || 'Unknown',
          type: type || 'General'
        });
      } catch (itemError) {
        console.warn(`Warning: Failed to parse news item ${index}`);
      }
    });

    if (newsItems.length === 0) {
      console.warn(`Warning: No news items found for ${numericYear}`);
    }

    return {
      success: true,
      source: url,
      pageTitle,
      newsReleases: newsItems,
      year: numericYear,
      scrapedAt: new Date().toISOString(),
      totalItems: newsItems.length
    };
  } catch (error) {
    return {
      success: false,
      error: error.message,
      year: requestedYear,
      timestamp: new Date().toISOString(),
      newsReleases: []
    };
  }
}
/**
 * Scrape https://www.nasa.gov/images/.
 *
 * @returns {Promise<object>} Object with `pageTitle`, `pageDescription`,
 *   `heroImage`, `imageOfTheDay`, `nasaHistory`, `imageResources`,
 *   `featuredGallery` (title, metadata, galleryLink, images) and
 *   `discoverMore`.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaImages() {
  try {
    const { data: html } = await axios.get('https://www.nasa.gov/images/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' }
    });
    const $ = cheerio.load(html);

    const imageOfTheDaySection = $('.hds-image-of-the-day');
    const imageOfTheDay = {
      title: imageOfTheDaySection.find('p.heading-22').text().trim(),
      description: imageOfTheDaySection.find('p.p-md').text().trim(),
      imageUrl: imageOfTheDaySection.find('figure img').attr('src'),
      pageUrl: imageOfTheDaySection.find('.under-image-button a').attr('href')
    };

    const historySection = $('.wp-block-nasa-blocks-callout');
    const nasaHistory = {
      title: historySection.find('h2.page-heading-md').text().trim(),
      description: historySection.find('p.p-lg').text().trim(),
      link: historySection.find('a.button-primary').attr('href'),
      backgroundImage: historySection.find('figure img').attr('src')
    };

    const imageResources = $('.hds-featured-link-list .featured-link-list-row').map((_, el) => {
      const element = $(el);
      return {
        title: element.find('h2.heading-36').text().trim(),
        link: element.find('a').attr('href')
      };
    }).get();

    const gallerySection = $('.hds-gallery-preview');
    const featuredGallery = {
      title: gallerySection.find('h2.heading-22').text().trim(),
      metadata: gallerySection.find('.hds-gallery-preview-label').text().trim().replace(/\s\s+/g, ' '),
      // The class is literally "desktop:grid-col-4". The backslash has to be
      // doubled so the selector engine receives "\:"; with a single backslash
      // JavaScript drops it and ":grid-col-4" is parsed as an unknown
      // pseudo-class, which makes the query throw.
      galleryLink: gallerySection.find('.desktop\\:grid-col-4 a.button-primary').attr('href'),
      images: gallerySection.find('.hds-gallery-preview-item img').map((_, img) => $(img).attr('src')).get()
    };

    const discoverMore = $('.hds-topic-cards a.topic-card').map((_, el) => {
      const element = $(el);
      return {
        title: element.find('.hds-topic-card-heading span').text().trim(),
        link: element.attr('href'),
        image: element.find('figure img').attr('src')
      };
    }).get();

    return {
      pageTitle: $('.hds-topic-hero h1').text().trim(),
      pageDescription: $('.hds-topic-hero .p-lg').text().trim(),
      heroImage: $('.hds-topic-hero figure img').attr('src'),
      imageOfTheDay,
      nasaHistory,
      imageResources,
      featuredGallery,
      discoverMore
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/live.
 *
 * @returns {Promise<object>} Object with `pageTitle`, `featuredVideo`
 *   (title, description, link, embedUrl), `upcomingEventsStatus` (text of the
 *   first paragraph after the "Upcoming Events" heading) and `watchOptions`.
 *   Missing link/src attributes are reported as empty strings.
 * @throws {Error} "Scraping failed for <url>: ..." wrapping any request or
 *   parse error.
 */
async function scrapeNasaLive() {
  const url = 'https://www.nasa.gov/live';
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  // Missing attributes are normalised to '' in this scraper.
  const orEmpty = (value) => (value ? value : '');
  try {
    const response = await axios.get(url, { headers: { 'User-Agent': userAgent } });
    const $ = cheerio.load(response.data);

    const videoBlock = $('.hds-featured-video');

    const watchOptions = $('.hds-featured-link-list .featured-link-list-row')
      .toArray()
      .map((rowNode) => {
        const row = $(rowNode);
        return {
          title: row.find('h2.heading-36').text().trim(),
          description: row.find('p.p-md').text().trim(),
          link: orEmpty(row.find('a').attr('href')),
          iconUrl: orEmpty(row.find('img').attr('src'))
        };
      });

    return {
      pageTitle: $('title').text().trim(),
      featuredVideo: {
        title: videoBlock.find('h3.heading-36').text().trim(),
        description: videoBlock.find('p.color-carbon-20').text().trim(),
        link: orEmpty(videoBlock.find('a.button-primary').attr('href')),
        embedUrl: orEmpty(videoBlock.find('.hds-embed-container iframe').attr('src'))
      },
      upcomingEventsStatus: $('h1.wp-block-heading:contains("Upcoming Events")')
        .nextAll('p')
        .first()
        .text()
        .trim(),
      watchOptions
    };
  } catch (failure) {
    throw new Error(`Scraping failed for ${url}: ${failure.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/apps/.
 *
 * @returns {Promise<{pageTitle: string, featuredApps: object[], otherApps: object[]}>}
 *   `featuredApps` has up to two entries: one from the featured-link block and
 *   one from the story block (which also carries `category`), each only when
 *   the block exists. `otherApps` lists `{ title, storeLinks }` for every
 *   content `h3` directly followed by a paragraph containing links.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaApps() {
  const url = 'https://www.nasa.gov/apps/';
  const requestConfig = {
    headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' }
  };
  try {
    const response = await axios.get(url, requestConfig);
    const $ = cheerio.load(response.data);

    const featuredApps = [];

    const featuredLinkBlock = $('.wp-block-nasa-blocks-featured-link');
    if (featuredLinkBlock.length) {
      featuredApps.push({
        title: featuredLinkBlock.find('h2.page-heading-md').text().trim(),
        description: featuredLinkBlock.find('p.p-md').text().trim(),
        link: featuredLinkBlock.find('a.button-primary').attr('href') || null,
        imageUrl: featuredLinkBlock.find('figure img').attr('src') || null,
      });
    }

    const storyBlock = $('.wp-block-nasa-blocks-story');
    if (storyBlock.length) {
      featuredApps.push({
        title: storyBlock.find('h2.display-48').text().trim(),
        category: storyBlock.find('h3.subtitle-md').text().trim(),
        description: storyBlock.find('p.p-md').first().text().trim(),
        link: storyBlock.find('a.button-primary').attr('href') || null,
        imageUrl: storyBlock.find('figure img').attr('src') || null,
      });
    }

    const otherApps = [];
    for (const headingNode of $('div.entry-content h3.wp-block-heading').toArray()) {
      const heading = $(headingNode);
      const title = heading.text().trim();

      // Only headings immediately followed by a <p> are considered.
      const linksParagraph = heading.next('p');
      if (!linksParagraph.length) {
        continue;
      }

      const storeLinks = linksParagraph
        .find('a')
        .toArray()
        .map((anchor) => ({
          storeName: $(anchor).text().trim(),
          url: $(anchor).attr('href') || null
        }));
      if (storeLinks.length === 0) {
        continue;
      }

      otherApps.push({ title, storeLinks });
    }

    return {
      pageTitle: $('h1.page-heading-md').text().trim(),
      featuredApps,
      otherApps,
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
/**
 * Scrape the podcast listing at https://www.nasa.gov/podcasts/.
 *
 * This function used to be a second top-level declaration named
 * `scrapeNasaPodcasts`. Declaring the same function name twice at the top
 * level of an ES module is a SyntaxError, so it now has its own name. It is
 * exported inline because that name is not part of the module's export list.
 *
 * @returns {Promise<{source: string, pageTitle: string, pageDescription: string, podcasts: object[]}>}
 *   `podcasts` holds one `{ title, link, imageUrl, description }` per
 *   `div.hds-content-item`.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
export async function scrapeNasaPodcastsList() {
  try {
    const { data: html } = await axios.get('https://www.nasa.gov/podcasts/', {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }
    });
    const $ = cheerio.load(html);
    const pageTitle = $('h1.display-100').text().trim();
    const pageDescription = $('div.grid-container-widescreen > p.p-lg').text().trim();

    const podcasts = $('div.hds-content-item').map((_, element) => {
      const card = $(element);
      return {
        title: card.find('h3.heading-22').text().trim(),
        link: card.find('a.hds-content-item-heading').attr('href'),
        imageUrl: card.find('a.hds-content-item-thumbnail img').attr('src'),
        description: card.find('div.hds-content-item-inner p').text().trim(),
      };
    }).get();

    return {
      source: 'NASA Podcasts',
      pageTitle,
      pageDescription,
      podcasts,
    };
  } catch (error) {
    throw new Error(`Scraping failed: ${error.message}`);
  }
}
/**
 * Scrape the gallery at https://www.nasa.gov/image-of-the-day/.
 *
 * @returns {Promise<{pageTitle: string, totalImages: number, images: object[]}>}
 *   `images` holds one `{ articleUrl, imageUrl, altText, caption }` per gallery
 *   item; missing attributes are reported as empty strings.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaImageOfTheDay() {
  const requestConfig = {
    headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }
  };
  try {
    const response = await axios.get('https://www.nasa.gov/image-of-the-day/', requestConfig);
    const $ = cheerio.load(response.data);

    const images = $('.hds-gallery-item-single.hds-gallery-image')
      .toArray()
      .map((node) => {
        const galleryItem = $(node);
        const anchor = galleryItem.find('a.hds-gallery-item-link');
        const img = anchor.find('img');
        return {
          articleUrl: anchor.attr('href') || '',
          imageUrl: img.attr('src') || '',
          altText: img.attr('alt') || '',
          caption: galleryItem.find('.hds-gallery-item-caption').text().trim()
        };
      });

    return {
      pageTitle: $('h1.heading-22').text().trim(),
      totalImages: images.length,
      images,
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/ebooks/.
 *
 * Reads the cards inside the `#aeronautics`, `#history` and `#hubble`
 * sections.
 *
 * @returns {Promise<{aeronautics: object[], history: object[], hubble: object[]}>}
 *   Each array holds `{ title, imageUrl, overviewUrl }` objects; the two URL
 *   fields are null when the attribute is missing.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaEbooks() {
  const targetUrl = 'https://www.nasa.gov/ebooks/';
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36';
  try {
    const response = await axios.get(targetUrl, { headers: { 'User-Agent': userAgent } });
    const $ = cheerio.load(response.data);

    // Collects every card found under the element matched by `sectionSelector`.
    const booksIn = (sectionSelector) =>
      $(sectionSelector)
        .find('.hds-card-custom')
        .toArray()
        .map((cardNode) => {
          const card = $(cardNode);
          const thumbnail = card.find('.hds-card-thumbnail img').attr('src');
          const overview = card.find('a.button-primary').attr('href');
          return {
            title: card.find('h3.heading-18').text().trim(),
            imageUrl: thumbnail ? thumbnail : null,
            overviewUrl: overview ? overview : null,
          };
        });

    return {
      aeronautics: booksIn('#aeronautics'),
      history: booksIn('#history'),
      hubble: booksIn('#hubble'),
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/interactives/.
 *
 * @returns {Promise<{pageTitle: string, sections: object[]}>} One section per
 *   `div.hds-card-grid`, each `{ section, description, items }`. `section`
 *   falls back to 'Games and Interactives' and `description` to null when
 *   empty. Item links that do not start with "http" are resolved against the
 *   page URL; a missing link is null.
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaInteractives() {
  const targetUrl = 'https://www.nasa.gov/interactives/';
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36';

  // Leaves absolute http(s) links untouched and resolves everything else.
  const absolutize = (href) => {
    if (!href) {
      return null;
    }
    if (href.startsWith('http')) {
      return href;
    }
    return new URL(href, targetUrl).href;
  };

  try {
    const response = await axios.get(targetUrl, { headers: { 'User-Agent': userAgent } });
    const $ = cheerio.load(response.data);

    const sections = $('div.hds-card-grid')
      .toArray()
      .map((gridNode) => {
        const grid = $(gridNode);
        const heading = grid.find('h2.section-heading-sm').text().trim();
        const intro = grid.find('.hds-card-grid-header > p').text().trim();

        const items = grid
          .find('.hds-card-custom')
          .toArray()
          .map((cardNode) => {
            const card = $(cardNode);
            return {
              title: card.find('h3.heading-18').text().trim(),
              description: card.find('p.line-height-lg').text().trim(),
              link: absolutize(card.find('a.button-primary').attr('href')),
              imageUrl: card.find('.hds-card-thumbnail img').attr('src') || null,
            };
          });

        return {
          section: heading || 'Games and Interactives',
          description: intro || null,
          items
        };
      });

    return {
      pageTitle: $('h1.page-heading-md').text().trim(),
      sections
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
/**
 * Scrape the learning-resources search page for a fixed set of term ids.
 *
 * The results are not read from the rendered markup: they are parsed from the
 * JSON object embedded in the `#nasa-hds-faceted-filter-js-extra` script tag.
 *
 * @returns {Promise<{totalResults: *, totalPages: *, resources: object[]}>}
 *   `resources` holds `{ title, link, description, imageUrl }` per post;
 *   `imageUrl` is '' when the post has no image markup or no `<img src>`.
 * @throws {Error} "Scraping failed for <url>: ..." when the request fails, the
 *   script tag or its JSON payload is missing, or the JSON cannot be parsed.
 */
async function scrapeNasaStemMultimedia() {
  const targetUrl = 'https://www.nasa.gov/learning-resources/search/?terms=8058%2C8059%2C8061%2C8062%2C8068';
  try {
    const { data: html } = await axios.get(targetUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
      }
    });
    const $ = cheerio.load(html);

    // .html() is null when the script tag is absent; fail with a clear message
    // instead of a TypeError from calling string methods on null.
    const scriptContent = $('#nasa-hds-faceted-filter-js-extra').html();
    if (!scriptContent) {
      throw new Error('Embedded search data script (#nasa-hds-faceted-filter-js-extra) not found');
    }

    // The script holds a JS assignment; the JSON payload is everything from
    // the first "{" to the last "}".
    const jsonStart = scriptContent.indexOf('{');
    const jsonEnd = scriptContent.lastIndexOf('}');
    if (jsonStart === -1 || jsonEnd <= jsonStart) {
      throw new Error('Embedded search data script does not contain a JSON object');
    }
    const pageData = JSON.parse(scriptContent.substring(jsonStart, jsonEnd + 1));

    const results = pageData.results;
    if (!results || typeof results !== 'object') {
      throw new Error('Embedded search data has no "results" object');
    }
    const posts = Array.isArray(results.posts) ? results.posts : [];

    const resources = posts.map((post) => {
      // `post.image` is an HTML snippet; parse it to pull out the <img> src.
      const imageUrl = post.image ? cheerio.load(post.image)('img').attr('src') || '' : '';
      return {
        title: post.title || null,
        link: post.link || null,
        description: post.desc || null,
        imageUrl: imageUrl,
      };
    });

    return {
      totalResults: results.results,
      totalPages: results.pages,
      resources: resources,
    };
  } catch (error) {
    throw new Error(`Scraping failed for ${targetUrl}: ${error.message}`);
  }
}
/**
 * Scrape https://www.nasa.gov/nasa-brand-center/.
 *
 * @returns {Promise<object>} Object with `pageTitle`, `pageDescription`,
 *   `bannerImage`, `guidelineCards`, `graphicStandards` (subtitle, title,
 *   summary, description, image) and `additionalGuidelines`: one
 *   `{ title, content }` per `h2.wp-block-heading` that has an id, where
 *   `content` lists the non-empty paragraph texts up to the next such heading
 *   (paragraphs equal to 'NOTE:' are dropped).
 * @throws {Error} "Scraping failed: ..." wrapping any request or parse error.
 */
async function scrapeNasaBrandCenter() {
  const url = 'https://www.nasa.gov/nasa-brand-center/';
  try {
    const response = await axios.get(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' }
    });
    const $ = cheerio.load(response.data);

    const intro = $('.wp-block-nasa-blocks-page-intro').first();

    const guidelineCards = $('.hds-card-grid .hds-card-custom')
      .toArray()
      .map((cardNode) => {
        const card = $(cardNode);
        return {
          title: card.find('h3.heading-18').text().trim(),
          description: card.find('p.line-height-lg').text().trim(),
          link: card.find('a.button-primary').attr('href'),
          image: card.find('.hds-card-thumbnail img').attr('src')
        };
      });

    const story = $('.wp-block-nasa-blocks-story');
    const storyImage = story.find('img');
    const graphicStandards = {
      subtitle: story.find('h3.subtitle-md').text().trim(),
      title: story.find('h2.display-48').text().trim(),
      summary: story.find('p.heading-18').text().trim(),
      description: story.find('p.p-md').text().trim(),
      image: {
        src: storyImage.attr('src'),
        alt: storyImage.attr('alt'),
        caption: story.find('.hds-caption-text').text().trim(),
        credits: story.find('.hds-credits').text().trim()
      }
    };

    const additionalGuidelines = $('h2.wp-block-heading[id]')
      .toArray()
      .map((headingNode) => {
        const heading = $(headingNode);
        // Paragraph siblings between this heading and the next one.
        const paragraphTexts = heading
          .nextUntil('h2.wp-block-heading, .wp-block-spacer + h2.wp-block-heading')
          .filter('p')
          .toArray()
          .map((p) => $(p).text().trim().replace(/\n/g, ' '));
        return {
          title: heading.text().trim(),
          content: paragraphTexts.filter((text) => text.length > 0 && text !== 'NOTE:')
        };
      });

    return {
      pageTitle: intro.find('h1.page-heading-md').text().trim(),
      pageDescription: intro.find('p.p-lg').text().trim(),
      bannerImage: intro.find('.hds-media-background img').attr('src'),
      guidelineCards,
      graphicStandards,
      additionalGuidelines
    };
  } catch (failure) {
    throw new Error(`Scraping failed: ${failure.message}`);
  }
}
export {
scrapeNasa,
scrapeNasaPlus,
scrapeNasaNews,
scrapeNasaEvents,
scrapeNasaLaunches,
scrapeNasaMultimedia,
scrapeNasaMissions,
scrapeNasaMoreMissions,
scrapeNasaHumansInSpace,
scrapeNasaScience,
scrapeNasaScienceEarth,
scrapeNasaScienceSolarSystem,
scrapeNasaScienceUniverse,
scrapeNasaAeronautics,
scrapeNasaTechnology,
scrapeNasaLearningResources,
scrapeNasaAbout,
scrapeNasaEspanol,
scrapeNasaSocialMedia,
scrapeNasaNewsletters,
scrapeNasaGetInvolved,
scrapeNasaSitemap,
scrapeNasaMediaContacts,
scrapeNasaPrivacy,
scrapeNasaFoia,
scrapeNasaNoFearAct,
scrapeNasaOig,
scrapeNasaBudgets,
scrapeNasaFinancialReports,
scrapeNasaContact,
scrapeNasaAccessibility,
scrapeNasaBlogs,
scrapeNasaVirtualGuest,
scrapeNasaPodcasts,
scrapeNasaPlusSeries,
scrapeNasaRecentlyPublished,
scrapeNasaNewsReleases,
scrapeNasaImages,
scrapeNasaLive,
scrapeNasaApps,
scrapeNasaPodcasts,
scrapeNasaImageOfTheDay,
scrapeNasaEbooks,
scrapeNasaInteractives,
scrapeNasaStemMultimedia,
scrapeNasaBrandCenter
};
// 4:29 -- https://github.com/Frenzycore --
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment