Files
platform/services/brain/crawler/server.js
Yusuf Suleman 4592e35732
All checks were successful
Security Checks / dependency-audit (push) Successful in 1m13s
Security Checks / secret-scanning (push) Successful in 3s
Security Checks / dockerfile-lint (push) Successful in 3s
feat: major platform expansion — Brain service, RSS reader, iOS app, AI assistants, Firefox extension
Brain Service:
- Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API)
- AI classification with tag definitions and folder assignment
- YouTube video download via yt-dlp
- Karakeep migration complete (96 items)
- Taxonomy management (folders with icons/colors, tags)
- Discovery shuffle, sort options, search (Meilisearch + pgvector)
- Item tag/folder editing, card color accents

RSS Reader Service:
- Custom FastAPI reader replacing Miniflux
- Feed management (add/delete/refresh), category support
- Full article extraction via Readability
- Background content fetching for new entries
- Mark all read with confirmation
- Infinite scroll, retention cleanup (30/60 day)
- 17 feeds migrated from Miniflux

iOS App (SwiftUI):
- Native iOS 17+ app with @Observable architecture
- Cookie-based auth, configurable gateway URL
- Dashboard with custom background photo + frosted glass widgets
- Full fitness module (today/templates/goals/food library)
- AI assistant chat (fitness + brain, raw JSON state management)
- 120fps ProMotion support

AI Assistants (Gateway):
- Unified dispatcher with fitness/brain domain detection
- Fitness: natural language food logging, photo analysis, multi-item splitting
- Brain: save/append/update/delete notes, search & answer, undo support
- Madiha user gets fitness-only (brain disabled)

Firefox Extension:
- One-click save to Brain from any page
- Login with platform credentials
- Right-click context menu (save page/link/image)
- Notes field for URL saves
- Signed and published on AMO

Other:
- Reader bookmark button routes to Brain (was Karakeep)
- Fitness food library with "Add" button + add-to-meal popup
- Kindle send file size check (25MB SMTP2GO limit)
- Atelier UI as default (useAtelierShell=true)
- Mobile upload box in nav drawer

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 00:56:29 -05:00

371 lines
11 KiB
JavaScript

import http from "node:http";
import { chromium } from "playwright-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
// Install stealth evasions into the shared Playwright Chromium launcher.
chromium.use(StealthPlugin());
// Listen port (PORT env var, default 3100). Radix 10 is explicit so a
// leading-zero value can never be misread.
const PORT = Number.parseInt(process.env.PORT || "3100", 10);
// Desktop-sized viewport so pages render their full (non-mobile) layout.
const VIEWPORT = { width: 1440, height: 900 };
// Mainstream desktop Chrome UA string to reduce bot detection.
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
// Navigation budget for page.goto (ms).
const NAV_TIMEOUT = 30_000;
// NOTE(review): SCREENSHOT_TIMEOUT is not referenced in this file — confirm
// it isn't used elsewhere before removing.
const SCREENSHOT_TIMEOUT = 8_000;
// Shared Chromium instance, lazily launched by ensureBrowser().
let browser = null;
// In-flight launch promise so concurrent callers share one launch instead of
// racing and spawning multiple Chromium processes.
let browserLaunch = null;

/**
 * Return a connected shared Chromium instance, (re)launching it if the
 * previous one died. Safe to call concurrently: overlapping callers await
 * the same launch.
 * @returns {Promise<object>} Playwright Browser
 */
async function ensureBrowser() {
  if (browser && browser.isConnected()) return browser;
  if (browserLaunch) return browserLaunch;
  browserLaunch = (async () => {
    // Dispose of a stale/disconnected instance before relaunching.
    if (browser) {
      try { await browser.close(); } catch {}
      browser = null;
    }
    console.log("[crawler] Launching browser...");
    browser = await chromium.launch({
      headless: true,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage", // containers often have a tiny /dev/shm
        "--disable-gpu",
      ],
    });
    console.log("[crawler] Browser ready");
    return browser;
  })();
  try {
    return await browserLaunch;
  } finally {
    // Clear even on failure so the next call can retry a fresh launch.
    browserLaunch = null;
  }
}
// Decode the handful of HTML entities that routinely appear in attribute
// values pulled out by regex. og:image URLs in particular almost always
// contain "&amp;" between query parameters. "&amp;" is decoded LAST so a
// literal "&amp;lt;" is not double-decoded into "<".
function decodeHtmlEntities(s) {
  return s
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#0?39;/g, "'")
    .replace(/&#x27;/gi, "'")
    .replace(/&amp;/g, "&");
}

/**
 * Extract og:image and other meta from rendered HTML using regexes.
 * Each key tries patterns in order (both attribute orders are covered,
 * since property= may appear before or after content=), keeping the first
 * match. Values are trimmed and entity-decoded.
 * @param {string} html - fully rendered document HTML
 * @returns {{og_image?: string, title?: string, description?: string, author?: string, favicon?: string}}
 */
function extractMeta(html) {
  const meta = {};
  const patterns = {
    og_image: [
      /(?:property|name)=["']og:image["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:image["']/i,
    ],
    title: [
      /(?:property|name)=["']og:title["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:title["']/i,
      /<title[^>]*>([^<]+)<\/title>/i, // plain <title> as last resort
    ],
    description: [
      /(?:property|name)=["']og:description["'][^>]*content=["']([^"']+)["']/i,
      /name=["']description["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:description["']/i,
    ],
    author: [
      /name=["']author["'][^>]*content=["']([^"']+)["']/i,
      /property=["']article:author["'][^>]*content=["']([^"']+)["']/i,
    ],
    favicon: [
      /rel=["']icon["'][^>]*href=["']([^"']+)["']/i,
      /rel=["']shortcut icon["'][^>]*href=["']([^"']+)["']/i,
    ],
  };
  for (const [key, pats] of Object.entries(patterns)) {
    for (const pat of pats) {
      const m = html.match(pat);
      if (m) {
        meta[key] = decodeHtmlEntities(m[1].trim());
        break;
      }
    }
  }
  return meta;
}
/**
 * True when the URL points at reddit.com or any of its subdomains
 * (www, old, np, ...) — all of which serve the same JSON API, so they can
 * skip the browser path. Unparseable URLs return false.
 * @param {string} url
 * @returns {boolean}
 */
function isRedditUrl(url) {
  try {
    const host = new URL(url).hostname.toLowerCase();
    return host === "reddit.com" || host.endsWith(".reddit.com");
  } catch {
    return false;
  }
}
/**
 * Resolve a Reddit share link ("/s/<id>") to its canonical /comments/ URL
 * by following redirects with a HEAD request. Any other URL — or a failed
 * or unexpected resolution — is returned unchanged.
 * @param {string} url
 * @returns {Promise<string>}
 */
async function resolveRedditShortUrl(url) {
  const looksLikeShareLink = /\/s\/[a-zA-Z0-9]+/.test(url);
  if (!looksLikeShareLink) return url;
  try {
    const resp = await fetch(url, {
      method: "HEAD",
      redirect: "follow",
      headers: { "User-Agent": "SecondBrain/1.0" },
    });
    const finalUrl = resp.url;
    // Only trust the redirect if it landed on an actual post page.
    if (finalUrl && finalUrl.includes("/comments/")) {
      console.log(`[crawler] Reddit short URL resolved: ${url} -> ${finalUrl}`);
      return finalUrl;
    }
  } catch (e) {
    console.warn("[crawler] Reddit short URL resolve failed:", e.message);
  }
  return url;
}
/**
 * Fetch a Reddit post via the public JSON API (append "/.json"), avoiding
 * login walls entirely. Returns an object shaped like crawl()'s result so
 * downstream consumers see a stable schema, or null so the caller can fall
 * back to the browser.
 * @param {string} url - Reddit post URL (short share links are resolved first)
 * @returns {Promise<object|null>}
 */
async function fetchRedditJson(url) {
  // Short share links ("/s/<id>") must be resolved to /comments/ first.
  url = await resolveRedditShortUrl(url);
  try {
    // "/.json" works on both trailing-slash and bare URLs; any query string
    // is preserved after the suffix.
    const jsonUrl = url.replace(/\/?(\?.*)?$/, "/.json$1");
    const resp = await fetch(jsonUrl, {
      headers: { "User-Agent": "SecondBrain/1.0" },
      redirect: "follow",
    });
    if (!resp.ok) return null;
    const data = await resp.json();
    const post = data?.[0]?.data?.children?.[0]?.data;
    if (!post) return null;

    // Image preference: full-size preview, then thumbnail (Reddit uses
    // placeholder strings like "self"/"default" there, hence the http check),
    // then the subreddit's own icon as a last resort.
    const previewImg =
      (post.preview?.images?.[0]?.source?.url || "").replace(/&amp;/g, "&") || null;
    const thumbnail = post.thumbnail?.startsWith("http") ? post.thumbnail : null;
    let ogImage = previewImg || thumbnail || null;
    if (!ogImage && post.subreddit) {
      try {
        const aboutResp = await fetch(
          `https://www.reddit.com/r/${post.subreddit}/about.json`,
          { headers: { "User-Agent": "SecondBrain/1.0" } }
        );
        if (aboutResp.ok) {
          const about = await aboutResp.json();
          // community_icon carries styling query params — strip them.
          const icon = about?.data?.community_icon?.replace(/&amp;/g, "&")?.split("?")?.[0]
            || about?.data?.icon_img
            || about?.data?.header_img;
          if (icon && icon.startsWith("http")) {
            ogImage = icon;
          }
        }
      } catch {}
    }

    return {
      url,
      html: null,
      text: `${post.title || ""}\n\n${post.selftext || ""}`.trim(),
      readable_html: null, // present for shape parity with crawl()'s result
      title: post.title || null,
      description: (post.selftext || "").slice(0, 200) || null,
      author: post.author ? `u/${post.author}` : null,
      og_image_url: ogImage ? ogImage.replace(/&amp;/g, "&") : null,
      favicon: null,
      screenshot: null,
      status_code: 200,
      error: null,
      subreddit: post.subreddit_name_prefixed || null,
    };
  } catch (e) {
    console.warn("[crawler] Reddit JSON failed:", e.message);
    return null;
  }
}
/**
 * Crawl a URL and return a result object with: rendered HTML, extracted
 * text (capped at 10k chars), Readability article HTML, meta fields
 * (title/description/author/og:image/favicon), a base64 JPEG screenshot,
 * the HTTP status code, and an error message if anything failed.
 *
 * Reddit URLs are served from the JSON API first (no login wall); the
 * browser path is only used as a fallback when that fails.
 * @param {string} url
 * @returns {Promise<object>} never throws — failures land in result.error
 */
async function crawl(url) {
  // Reddit: use JSON API (avoids login walls entirely)
  if (isRedditUrl(url)) {
    const redditData = await fetchRedditJson(url);
    if (redditData) {
      console.log(`[crawler] Reddit JSON OK: ${url} (og=${!!redditData.og_image_url})`);
      return redditData;
    }
    console.log(`[crawler] Reddit JSON failed, falling back to browser: ${url}`);
  }
  const crawlUrl = url;
  let b;
  try {
    b = await ensureBrowser();
  } catch (e) {
    // One retry after resetting state (e.g. the old browser process died).
    console.error("[crawler] Browser launch failed, retrying:", e.message);
    browser = null;
    b = await ensureBrowser();
  }
  // Fresh context per crawl keeps cookies/storage isolated between requests.
  const contextOpts = {
    viewport: VIEWPORT,
    userAgent: USER_AGENT,
    ignoreHTTPSErrors: true,
  };
  // Reddit: set cookies to bypass login walls
  // NOTE(review): only extra HTTP headers are set here, no cookies — the
  // comment above doesn't match the code; confirm intent.
  if (isRedditUrl(url)) {
    contextOpts.extraHTTPHeaders = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
    };
  }
  const context = await b.newContext(contextOpts);
  const page = await context.newPage();
  // All fields default to null so the response shape is stable even on error.
  const result = {
    url,
    html: null,
    text: null,
    readable_html: null,
    title: null,
    description: null,
    author: null,
    og_image_url: null,
    favicon: null,
    screenshot: null, // base64
    status_code: null,
    error: null,
  };
  try {
    // Navigate (use normalized URL to avoid login walls)
    const response = await page.goto(crawlUrl, {
      waitUntil: "domcontentloaded",
      timeout: NAV_TIMEOUT,
    });
    result.status_code = response?.status() || null;
    // Wait for network to settle (up to 5s)
    try {
      await page.waitForLoadState("networkidle", { timeout: 5000 });
    } catch {
      // networkidle timeout is fine, page is probably loaded enough
    }
    // Reddit: dismiss login modals and overlays
    if (isRedditUrl(url)) {
      await page.evaluate(() => {
        // Remove login modal/overlay
        document.querySelectorAll('shreddit-overlay-display, [id*="login"], .overlay-container, reddit-cookie-banner').forEach(el => el.remove());
        // Remove any body scroll locks
        document.body.style.overflow = 'auto';
        document.documentElement.style.overflow = 'auto';
      }).catch(() => {});
      // Give the page a moment to reflow after removing overlays.
      await page.waitForTimeout(1000);
    }
    // Get rendered HTML + screenshot in parallel
    const [html, screenshot] = await Promise.all([
      page.content(),
      page
        .screenshot({ type: "jpeg", quality: 80, fullPage: false })
        .catch((e) => {
          // Screenshot failure is non-fatal; the crawl still returns HTML.
          console.warn("[crawler] Screenshot failed:", e.message);
          return null;
        }),
    ]);
    result.html = html;
    // Extract text from page — prefer semantic containers over <body>.
    result.text = await page
      .evaluate(() => {
        const el =
          document.querySelector("article") ||
          document.querySelector("main") ||
          document.querySelector('[role="main"]') ||
          document.body;
        return el ? el.innerText.slice(0, 10000) : "";
      })
      .catch(() => "");
    // Extract readable article HTML via Mozilla Readability
    try {
      const dom = new JSDOM(html, { url: crawlUrl });
      const reader = new Readability(dom.window.document);
      const article = reader.parse();
      if (article && article.content) {
        result.readable_html = article.content;
        // Readability's text is cleaner than raw innerText; prefer it.
        if (article.textContent) {
          result.text = article.textContent.slice(0, 10000);
        }
      }
    } catch (e) {
      console.warn("[crawler] Readability failed:", e.message);
    }
    // Extract meta from rendered DOM
    const meta = extractMeta(html);
    result.title = meta.title || (await page.title()) || null;
    result.description = meta.description || null;
    result.author = meta.author || null;
    result.og_image_url = meta.og_image || null;
    result.favicon = meta.favicon || null;
    // Screenshot as base64
    if (screenshot) {
      result.screenshot = screenshot.toString("base64");
    }
  } catch (e) {
    result.error = e.message;
    console.error("[crawler] Crawl error:", url, e.message);
    // If browser crashed, reset it for next request
    if (e.message.includes("closed") || e.message.includes("crashed")) {
      browser = null;
    }
  } finally {
    // Always tear down the page/context, even on error, to avoid leaks.
    await page.close().catch(() => {});
    await context.close().catch(() => {});
  }
  return result;
}
// Simple HTTP server: GET /health and POST /crawl ({"url": "..."}).
// Request bodies are capped — this endpoint takes untrusted input, and an
// unbounded `body += chunk` loop would let a client exhaust memory.
const MAX_BODY_BYTES = 1024 * 1024; // crawl payloads are tiny; 1 MiB is generous
const server = http.createServer(async (req, res) => {
  // Health check
  if (req.method === "GET" && req.url === "/health") {
    res.writeHead(200, { "Content-Type": "application/json" });
    res.end(JSON.stringify({ status: "ok" }));
    return;
  }
  // Crawl endpoint
  if (req.method === "POST" && req.url === "/crawl") {
    let body = "";
    let tooLarge = false;
    req.on("data", (chunk) => {
      body += chunk;
      if (!tooLarge && body.length > MAX_BODY_BYTES) {
        tooLarge = true;
        res.writeHead(413, { "Content-Type": "application/json" });
        res.end(JSON.stringify({ error: "request body too large" }));
        req.destroy(); // stop reading; "end" may never fire after this
      }
    });
    // A dropped/errored socket must not crash the process.
    req.on("error", (e) => {
      console.warn("[crawler] Request stream error:", e.message);
    });
    req.on("end", async () => {
      if (tooLarge) return; // already responded with 413
      try {
        const { url } = JSON.parse(body);
        if (!url) {
          res.writeHead(400, { "Content-Type": "application/json" });
          res.end(JSON.stringify({ error: "url is required" }));
          return;
        }
        console.log(`[crawler] Crawling: ${url}`);
        const result = await crawl(url);
        console.log(
          `[crawler] Done: ${url} (status=${result.status_code}, og=${!!result.og_image_url}, ss=${!!result.screenshot})`
        );
        res.writeHead(200, { "Content-Type": "application/json" });
        res.end(JSON.stringify(result));
      } catch (e) {
        // Covers malformed JSON bodies and unexpected crawl() throws.
        console.error("[crawler] Request error:", e);
        res.writeHead(500, { "Content-Type": "application/json" });
        res.end(JSON.stringify({ error: e.message }));
      }
    });
    return;
  }
  res.writeHead(404);
  res.end("Not found");
});
// Startup: warm the browser before accepting traffic, so the first crawl
// doesn't pay launch latency. Exit non-zero on launch failure instead of
// dying with an unhandled rejection, so the orchestrator restarts us.
(async () => {
  try {
    await ensureBrowser();
  } catch (e) {
    console.error("[crawler] Fatal: browser failed to launch:", e.message);
    process.exit(1);
  }
  server.listen(PORT, () => {
    console.log(`[crawler] Listening on :${PORT}`);
  });
})();
// Graceful shutdown: stop accepting new connections, close the browser, exit.
process.on("SIGTERM", async () => {
  console.log("[crawler] Shutting down...");
  // Stop intake first so no new crawl starts against a closing browser.
  server.close();
  if (browser) await browser.close().catch(() => {});
  process.exit(0);
});