// Headless-crawl microservice.
//   POST /crawl { url } -> { url, html, text, readable_html, title, description,
//                            author, og_image_url, favicon, screenshot (base64 jpeg),
//                            status_code, error }
//   GET  /health         -> { status: "ok" }
// Reddit URLs are served from the public JSON API first (no login wall); all
// other URLs are rendered in a shared stealth Chromium instance.
import http from "node:http";
import { chromium } from "playwright-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";

chromium.use(StealthPlugin());

const PORT = Number.parseInt(process.env.PORT || "3100", 10);
const VIEWPORT = { width: 1440, height: 900 };
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
const NAV_TIMEOUT = 30_000;
const SCREENSHOT_TIMEOUT = 8_000;

// Shared browser instance, lazily (re)launched; null when not yet started or
// after a crash was detected.
let browser = null;

/**
 * Return a connected Chromium instance, launching (or relaunching after a
 * disconnect) as needed.
 * @returns {Promise<import("playwright").Browser>}
 */
async function ensureBrowser() {
  if (browser && browser.isConnected()) return browser;
  if (browser) {
    // Stale/disconnected handle — best-effort close before relaunch.
    try {
      await browser.close();
    } catch {}
    browser = null;
  }
  console.log("[crawler] Launching browser...");
  browser = await chromium.launch({
    headless: true,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-gpu",
    ],
  });
  console.log("[crawler] Browser ready");
  return browser;
}

/**
 * Extract og:image and other meta tags from rendered HTML via regex (tolerant
 * of either attribute order: property-before-content and content-before-property).
 * @param {string} html - Full rendered document markup.
 * @returns {{og_image?: string, title?: string, description?: string, author?: string, favicon?: string}}
 */
function extractMeta(html) {
  const meta = {};
  const patterns = {
    og_image: [
      /(?:property|name)=["']og:image["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:image["']/i,
    ],
    title: [
      /(?:property|name)=["']og:title["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:title["']/i,
      // FIX: pattern was garbled to /]*>…/ (lost "<title[^>"), matching
      // arbitrary ">text<" spans. Restored <title> element fallback.
      /<title[^>]*>([^<]+)<\/title>/i,
    ],
    description: [
      /(?:property|name)=["']og:description["'][^>]*content=["']([^"']+)["']/i,
      /name=["']description["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:description["']/i,
    ],
    author: [
      /name=["']author["'][^>]*content=["']([^"']+)["']/i,
      /property=["']article:author["'][^>]*content=["']([^"']+)["']/i,
    ],
    favicon: [
      /rel=["']icon["'][^>]*href=["']([^"']+)["']/i,
      /rel=["']shortcut icon["'][^>]*href=["']([^"']+)["']/i,
    ],
  };
  for (const [key, pats] of Object.entries(patterns)) {
    for (const pat of pats) {
      const m = html.match(pat);
      if (m) {
        meta[key] = m[1].trim();
        break;
      }
    }
  }
  return meta;
}

/**
 * True when the URL's hostname is reddit.com / www.reddit.com.
 * @param {string} url
 * @returns {boolean}
 */
function isRedditUrl(url) {
  try {
    const h = new URL(url).hostname;
    return h === "www.reddit.com" || h === "reddit.com";
  } catch {}
  return false;
}

/**
 * Resolve Reddit share-short URLs (/r/sub/s/xxx) to the canonical post URL by
 * following redirects with a HEAD request. Returns the input URL unchanged on
 * any failure or when it is not a short URL.
 * @param {string} url
 * @returns {Promise<string>}
 */
async function resolveRedditShortUrl(url) {
  // Reddit short URLs (/r/sub/s/xxx) redirect to the actual post
  if (/\/s\/[a-zA-Z0-9]+/.test(url)) {
    try {
      const resp = await fetch(url, {
        method: "HEAD",
        redirect: "follow",
        headers: { "User-Agent": "SecondBrain/1.0" },
      });
      const resolved = resp.url;
      if (resolved && resolved.includes("/comments/")) {
        console.log(`[crawler] Reddit short URL resolved: ${url} -> ${resolved}`);
        return resolved;
      }
    } catch (e) {
      console.warn("[crawler] Reddit short URL resolve failed:", e.message);
    }
  }
  return url;
}

/**
 * Fetch a Reddit post through the public JSON API (append `.json` to the post
 * URL), bypassing login walls entirely. Falls back to the subreddit icon when
 * the post has no preview image.
 * @param {string} url - Reddit post URL (short URLs are resolved first).
 * @returns {Promise<object|null>} Crawl-result-shaped object, or null so the
 *   caller can fall back to browser rendering.
 */
async function fetchRedditJson(url) {
  // Resolve short URLs first
  url = await resolveRedditShortUrl(url);
  // Reddit JSON API — append .json to get structured data
  try {
    const jsonUrl = url.replace(/\/?(\?.*)?$/, "/.json$1");
    const resp = await fetch(jsonUrl, {
      headers: { "User-Agent": "SecondBrain/1.0" },
      redirect: "follow",
    });
    if (!resp.ok) return null;
    const data = await resp.json();
    const post = data?.[0]?.data?.children?.[0]?.data;
    if (!post) return null;
    // FIX: Reddit HTML-escapes "&" in media URLs; the replace targeted "&"
    // (a no-op) instead of "&amp;", leaving broken image URLs.
    const previewImg =
      (post.preview?.images?.[0]?.source?.url || "").replace(/&amp;/g, "&") || null;
    const thumbnail = post.thumbnail?.startsWith("http") ? post.thumbnail : null;
    // If no preview image, try to get subreddit icon
    let ogImage = previewImg || thumbnail || null;
    if (!ogImage && post.subreddit) {
      try {
        const aboutResp = await fetch(
          `https://www.reddit.com/r/${post.subreddit}/about.json`,
          { headers: { "User-Agent": "SecondBrain/1.0" } }
        );
        if (aboutResp.ok) {
          const about = await aboutResp.json();
          const icon =
            about?.data?.community_icon?.replace(/&amp;/g, "&")?.split("?")?.[0] ||
            about?.data?.icon_img ||
            about?.data?.header_img;
          if (icon && icon.startsWith("http")) {
            ogImage = icon;
          }
        }
      } catch {}
    }
    return {
      url,
      html: null,
      text: `${post.title || ""}\n\n${post.selftext || ""}`.trim(),
      title: post.title || null,
      description: (post.selftext || "").slice(0, 200) || null,
      author: post.author ? `u/${post.author}` : null,
      og_image_url: ogImage ? ogImage.replace(/&amp;/g, "&") : null,
      favicon: null,
      screenshot: null,
      status_code: 200,
      error: null,
      subreddit: post.subreddit_name_prefixed || null,
    };
  } catch (e) {
    console.warn("[crawler] Reddit JSON failed:", e.message);
    return null;
  }
}

/**
 * Crawl a URL: Reddit JSON fast path when applicable, otherwise render the
 * page in the shared stealth browser, extract text/meta/readable article, and
 * capture a viewport screenshot. Never throws for page-level failures; errors
 * are reported in the result's `error` field.
 * @param {string} url
 * @returns {Promise<object>} Result object (see header comment for shape).
 */
async function crawl(url) {
  // Reddit: use JSON API (avoids login walls entirely)
  if (isRedditUrl(url)) {
    const redditData = await fetchRedditJson(url);
    if (redditData) {
      console.log(`[crawler] Reddit JSON OK: ${url} (og=${!!redditData.og_image_url})`);
      return redditData;
    }
    console.log(`[crawler] Reddit JSON failed, falling back to browser: ${url}`);
  }
  const crawlUrl = url;
  let b;
  try {
    b = await ensureBrowser();
  } catch (e) {
    // One retry with a fresh browser handle — launch failures are often transient.
    console.error("[crawler] Browser launch failed, retrying:", e.message);
    browser = null;
    b = await ensureBrowser();
  }
  const contextOpts = {
    viewport: VIEWPORT,
    userAgent: USER_AGENT,
    ignoreHTTPSErrors: true,
  };
  // Reddit fallback: browser-like Accept headers reduce login-wall redirects.
  if (isRedditUrl(url)) {
    contextOpts.extraHTTPHeaders = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
    };
  }
  const context = await b.newContext(contextOpts);
  const page = await context.newPage();
  const result = {
    url,
    html: null,
    text: null,
    readable_html: null,
    title: null,
    description: null,
    author: null,
    og_image_url: null,
    favicon: null,
    screenshot: null, // base64
    status_code: null,
    error: null,
  };
  try {
    // Navigate (use normalized URL to avoid login walls)
    const response = await page.goto(crawlUrl, {
      waitUntil: "domcontentloaded",
      timeout: NAV_TIMEOUT,
    });
    result.status_code = response?.status() || null;
    // Wait for network to settle (up to 5s)
    try {
      await page.waitForLoadState("networkidle", { timeout: 5000 });
    } catch {
      // networkidle timeout is fine, page is probably loaded enough
    }
    // Reddit: dismiss login modals and overlays
    if (isRedditUrl(url)) {
      await page
        .evaluate(() => {
          // Remove login modal/overlay
          document
            .querySelectorAll(
              'shreddit-overlay-display, [id*="login"], .overlay-container, reddit-cookie-banner'
            )
            .forEach((el) => el.remove());
          // Remove any body scroll locks
          document.body.style.overflow = 'auto';
          document.documentElement.style.overflow = 'auto';
        })
        .catch(() => {});
      await page.waitForTimeout(1000);
    }
    // Get rendered HTML + screenshot in parallel
    const [html, screenshot] = await Promise.all([
      page.content(),
      page
        .screenshot({ type: "jpeg", quality: 80, fullPage: false })
        .catch((e) => {
          console.warn("[crawler] Screenshot failed:", e.message);
          return null;
        }),
    ]);
    result.html = html;
    // Extract text from page (prefer semantic containers over body)
    result.text = await page
      .evaluate(() => {
        const el =
          document.querySelector("article") ||
          document.querySelector("main") ||
          document.querySelector('[role="main"]') ||
          document.body;
        return el ? el.innerText.slice(0, 10000) : "";
      })
      .catch(() => "");
    // Extract readable article HTML via Mozilla Readability
    try {
      const dom = new JSDOM(html, { url: crawlUrl });
      const reader = new Readability(dom.window.document);
      const article = reader.parse();
      if (article && article.content) {
        result.readable_html = article.content;
        if (article.textContent) {
          result.text = article.textContent.slice(0, 10000);
        }
      }
    } catch (e) {
      console.warn("[crawler] Readability failed:", e.message);
    }
    // Extract meta from rendered DOM
    const meta = extractMeta(html);
    result.title = meta.title || (await page.title()) || null;
    result.description = meta.description || null;
    result.author = meta.author || null;
    result.og_image_url = meta.og_image || null;
    result.favicon = meta.favicon || null;
    // Screenshot as base64
    if (screenshot) {
      result.screenshot = screenshot.toString("base64");
    }
  } catch (e) {
    result.error = e.message;
    console.error("[crawler] Crawl error:", url, e.message);
    // If browser crashed, reset it for next request
    if (e.message.includes("closed") || e.message.includes("crashed")) {
      browser = null;
    }
  } finally {
    await page.close().catch(() => {});
    await context.close().catch(() => {});
  }
  return result;
}

// Simple HTTP server: GET /health and POST /crawl { url }.
const server = http.createServer(async (req, res) => {
  // Health check
  if (req.method === "GET" && req.url === "/health") {
    res.writeHead(200, { "Content-Type": "application/json" });
    res.end(JSON.stringify({ status: "ok" }));
    return;
  }
  // Crawl endpoint
  if (req.method === "POST" && req.url === "/crawl") {
    let body = "";
    req.on("data", (chunk) => (body += chunk));
    req.on("end", async () => {
      try {
        const { url } = JSON.parse(body);
        if (!url) {
          res.writeHead(400, { "Content-Type": "application/json" });
          res.end(JSON.stringify({ error: "url is required" }));
          return;
        }
        console.log(`[crawler] Crawling: ${url}`);
        const result = await crawl(url);
        console.log(
          `[crawler] Done: ${url} (status=${result.status_code}, og=${!!result.og_image_url}, ss=${!!result.screenshot})`
        );
        res.writeHead(200, { "Content-Type": "application/json" });
        res.end(JSON.stringify(result));
      } catch (e) {
        console.error("[crawler] Request error:", e);
        res.writeHead(500, { "Content-Type": "application/json" });
        res.end(JSON.stringify({ error: e.message }));
      }
    });
    return;
  }
  res.writeHead(404);
  res.end("Not found");
});

// Startup: warm the browser before accepting traffic.
(async () => {
  await ensureBrowser();
  server.listen(PORT, () => {
    console.log(`[crawler] Listening on :${PORT}`);
  });
})();

// Graceful shutdown (SIGINT added for Ctrl-C / dev containers).
async function shutdown() {
  console.log("[crawler] Shutting down...");
  if (browser) await browser.close().catch(() => {});
  process.exit(0);
}
process.on("SIGTERM", shutdown);
process.on("SIGINT", shutdown);