// Commit summary
//
// Brain Service:
// - Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API)
// - AI classification with tag definitions and folder assignment
// - YouTube video download via yt-dlp
// - Karakeep migration complete (96 items)
// - Taxonomy management (folders with icons/colors, tags)
// - Discovery shuffle, sort options, search (Meilisearch + pgvector)
// - Item tag/folder editing, card color accents
//
// RSS Reader Service:
// - Custom FastAPI reader replacing Miniflux
// - Feed management (add/delete/refresh), category support
// - Full article extraction via Readability
// - Background content fetching for new entries
// - Mark all read with confirmation
// - Infinite scroll, retention cleanup (30/60 day)
// - 17 feeds migrated from Miniflux
//
// iOS App (SwiftUI):
// - Native iOS 17+ app with @Observable architecture
// - Cookie-based auth, configurable gateway URL
// - Dashboard with custom background photo + frosted glass widgets
// - Full fitness module (today/templates/goals/food library)
// - AI assistant chat (fitness + brain, raw JSON state management)
// - 120fps ProMotion support
//
// AI Assistants (Gateway):
// - Unified dispatcher with fitness/brain domain detection
// - Fitness: natural language food logging, photo analysis, multi-item splitting
// - Brain: save/append/update/delete notes, search & answer, undo support
// - Madiha user gets fitness-only (brain disabled)
//
// Firefox Extension:
// - One-click save to Brain from any page
// - Login with platform credentials
// - Right-click context menu (save page/link/image)
// - Notes field for URL saves
// - Signed and published on AMO
//
// Other:
// - Reader bookmark button routes to Brain (was Karakeep)
// - Fitness food library with "Add" button + add-to-meal popup
// - Kindle send file size check (25MB SMTP2GO limit)
// - Atelier UI as default (useAtelierShell=true)
// - Mobile upload box in nav drawer
//
// Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
//
// File: 371 lines, 11 KiB, JavaScript
import http from "node:http";
|
|
import { chromium } from "playwright-extra";
|
|
import StealthPlugin from "puppeteer-extra-plugin-stealth";
|
|
import { Readability } from "@mozilla/readability";
|
|
import { JSDOM } from "jsdom";
|
|
|
|
// Register the stealth plugin on the Chromium launcher at module load,
// i.e. before any browser is launched through it.
chromium.use(StealthPlugin());
|
|
|
|
// TCP port the HTTP server listens on; overridable through the PORT env var.
// The explicit radix keeps values such as "0x10" from being read as hex.
const PORT = Number.parseInt(process.env.PORT || "3100", 10);

// Viewport applied to every browser context.
const VIEWPORT = { width: 1440, height: 900 };

// User-agent string presented to crawled sites.
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";

// Navigation time budget in milliseconds.
const NAV_TIMEOUT = 30_000;

// Time budget in milliseconds intended for page screenshots.
const SCREENSHOT_TIMEOUT = 8_000;

// Shared browser instance; null until a browser has been launched.
let browser = null;
|
|
|
|
// Launch currently in progress, shared by concurrent ensureBrowser() callers.
// null whenever no launch is running.
let browserLaunchPromise = null;

/**
 * Dispose of a stale browser (if any) and launch a fresh headless Chromium.
 * Stores the new instance in the module-level `browser` variable.
 *
 * @returns {Promise<object>} the newly launched browser
 */
async function launchBrowser() {
  if (browser) {
    try {
      await browser.close();
    } catch (e) {
      // Best effort: a disconnected browser frequently cannot be closed cleanly.
      console.warn("[crawler] Closing stale browser failed:", e.message);
    }
    browser = null;
  }

  console.log("[crawler] Launching browser...");
  browser = await chromium.launch({
    headless: true,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-gpu",
    ],
  });
  console.log("[crawler] Browser ready");
  return browser;
}

/**
 * Return a connected browser, launching one when necessary.
 *
 * Concurrent callers that arrive while a launch is in progress share that
 * launch instead of each starting their own Chromium process.
 *
 * @returns {Promise<object>} a connected browser instance
 * @throws whatever `chromium.launch` rejects with when the launch fails
 */
async function ensureBrowser() {
  if (browser && browser.isConnected()) return browser;

  if (!browserLaunchPromise) {
    browserLaunchPromise = launchBrowser().finally(() => {
      // Clear the marker on success and failure so a later call can relaunch.
      browserLaunchPromise = null;
    });
  }
  return browserLaunchPromise;
}
|
|
|
|
// Quoted attribute value. Double- and single-quoted forms are matched
// separately so a value may contain the *other* quote character
// (e.g. content="Don't panic"). Exactly one of the two groups is set.
const META_QUOTED_VALUE = `(?:"([^"]+)"|'([^']+)')`;

// Build a regex for `<keyAttr>="<key>" ... <valueAttr>="<value>"` within one tag.
const metaKeyThenValue = (keyAttr, key, valueAttr) =>
  new RegExp(`${keyAttr}=["']${key}["'][^>]*${valueAttr}=${META_QUOTED_VALUE}`, "i");

// Build a regex for `<valueAttr>="<value>" ... <keyAttr>="<key>"` within one tag.
const metaValueThenKey = (keyAttr, key, valueAttr) =>
  new RegExp(`${valueAttr}=${META_QUOTED_VALUE}[^>]*${keyAttr}=["']${key}["']`, "i");

const META_OG_ATTR = "(?:property|name)";

// Per result key: patterns tried in order; the first match wins.
const META_PATTERNS = Object.freeze({
  og_image: [
    metaKeyThenValue(META_OG_ATTR, "og:image", "content"),
    metaValueThenKey(META_OG_ATTR, "og:image", "content"),
  ],
  title: [
    metaKeyThenValue(META_OG_ATTR, "og:title", "content"),
    metaValueThenKey(META_OG_ATTR, "og:title", "content"),
    /<title[^>]*>([^<]+)<\/title>/i,
  ],
  description: [
    metaKeyThenValue(META_OG_ATTR, "og:description", "content"),
    metaKeyThenValue("name", "description", "content"),
    metaValueThenKey(META_OG_ATTR, "og:description", "content"),
  ],
  author: [
    metaKeyThenValue("name", "author", "content"),
    metaKeyThenValue("property", "article:author", "content"),
  ],
  favicon: [
    metaKeyThenValue("rel", "icon", "href"),
    metaKeyThenValue("rel", "shortcut icon", "href"),
  ],
});

const META_NAMED_ENTITIES = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", "\u00a0"],
]);

/**
 * Decode numeric and the most common named HTML entities in one pass
 * (one pass means "&amp;lt;" becomes "&lt;", not "<").
 * Unknown entities are left untouched.
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole, body) => {
    if (body[0] === "#") {
      const isHex = body[1] === "x" || body[1] === "X";
      const code = isHex
        ? Number.parseInt(body.slice(2), 16)
        : Number.parseInt(body.slice(1), 10);
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    }
    return META_NAMED_ENTITIES.get(body.toLowerCase()) ?? whole;
  });
}

/**
 * Extract og:image and other metadata from rendered HTML using regexes.
 *
 * @param {string} html - rendered page HTML
 * @returns {{og_image?: string, title?: string, description?: string,
 *            author?: string, favicon?: string}} only the keys that were
 *          found; values are entity-decoded and trimmed. A non-string or
 *          empty input yields an empty object.
 */
function extractMeta(html) {
  const meta = {};
  if (typeof html !== "string" || html.length === 0) return meta;

  for (const [key, candidates] of Object.entries(META_PATTERNS)) {
    for (const pattern of candidates) {
      const match = html.match(pattern);
      if (match) {
        // Attribute patterns capture into group 1 or 2 depending on the quote style.
        const raw = match[1] ?? match[2] ?? "";
        meta[key] = decodeHtmlEntities(raw).trim();
        break;
      }
    }
  }

  return meta;
}
|
|
|
|
/**
 * Tell whether a URL points at Reddit.
 *
 * Matches the bare domain and any subdomain (www., old., new., np., ...),
 * but not look-alike hosts such as "notreddit.com" or "reddit.com.evil.test".
 *
 * @param {string} url - candidate URL
 * @returns {boolean} true for reddit.com hosts; false otherwise, including
 *          when `url` cannot be parsed
 */
function isRedditUrl(url) {
  let hostname;
  try {
    // URL#hostname is already lower-cased by the parser.
    hostname = new URL(url).hostname;
  } catch {
    // Not a parseable absolute URL, so it cannot be a Reddit URL.
    return false;
  }
  return hostname === "reddit.com" || hostname.endsWith(".reddit.com");
}
|
|
|
|
/**
 * Resolve a Reddit share link (/r/<sub>/s/<id>) to the post it redirects to.
 *
 * @param {string} url - any URL; only ones containing "/s/<id>" are resolved
 * @returns {Promise<string>} the redirect target when it is a "/comments/"
 *          URL; otherwise the input URL unchanged (also on network failure
 *          or timeout)
 */
async function resolveRedditShortUrl(url) {
  // Upper bound for the HEAD request so a stalled connection cannot hang the crawl.
  const RESOLVE_TIMEOUT_MS = 10_000;

  // Reddit short URLs (/r/sub/s/xxx) redirect to the actual post.
  if (!/\/s\/[a-zA-Z0-9]+/.test(url)) return url;

  try {
    const resp = await fetch(url, {
      method: "HEAD",
      redirect: "follow",
      headers: { "User-Agent": "SecondBrain/1.0" },
      signal: AbortSignal.timeout(RESOLVE_TIMEOUT_MS),
    });
    // After following redirects, resp.url holds the final location.
    const resolved = resp.url;
    if (resolved && resolved.includes("/comments/")) {
      console.log(`[crawler] Reddit short URL resolved: ${url} -> ${resolved}`);
      return resolved;
    }
  } catch (e) {
    console.warn("[crawler] Reddit short URL resolve failed:", e.message);
  }

  return url;
}
|
|
|
|
/**
 * Fetch a Reddit post through Reddit's JSON endpoint (post URL + "/.json")
 * and map it onto the crawler's result shape.
 *
 * Image selection order: post preview image, post thumbnail, then the
 * subreddit's icon (community_icon, icon_img, header_img).
 *
 * @param {string} url - Reddit post URL or share link
 * @returns {Promise<object|null>} crawl result, or null when the request
 *          fails, times out, or the response contains no post
 */
async function fetchRedditJson(url) {
  // Upper bound per Reddit request so a stalled connection cannot hang the crawl.
  const REQUEST_TIMEOUT_MS = 10_000;
  const requestHeaders = { "User-Agent": "SecondBrain/1.0" };
  // URLs inside Reddit's JSON arrive HTML-escaped ("&amp;" instead of "&").
  const unescapeAmp = (value) => value.replace(/&amp;/g, "&");

  // Resolve short URLs first.
  const postUrl = await resolveRedditShortUrl(url);

  try {
    // Append "/.json" to the path; the query string is kept and any
    // #fragment is dropped so the suffix cannot land behind it.
    const jsonUrl = new URL(postUrl);
    jsonUrl.hash = "";
    jsonUrl.pathname = jsonUrl.pathname.replace(/\/?$/, "/.json");

    const resp = await fetch(jsonUrl.href, {
      headers: requestHeaders,
      redirect: "follow",
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
    });
    if (!resp.ok) return null;

    const data = await resp.json();
    const post = data?.[0]?.data?.children?.[0]?.data;
    if (!post) return null;

    const previewImg = post.preview?.images?.[0]?.source?.url || null;
    // The thumbnail field also carries placeholders such as "self" or "default".
    const thumbnail =
      typeof post.thumbnail === "string" && post.thumbnail.startsWith("http")
        ? post.thumbnail
        : null;

    // If the post has no image of its own, try the subreddit icon.
    let ogImage = previewImg || thumbnail || null;
    if (!ogImage && post.subreddit) {
      try {
        const aboutResp = await fetch(
          `https://www.reddit.com/r/${encodeURIComponent(post.subreddit)}/about.json`,
          {
            headers: requestHeaders,
            signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
          }
        );
        if (aboutResp.ok) {
          const about = await aboutResp.json();
          const communityIcon =
            typeof about?.data?.community_icon === "string"
              ? unescapeAmp(about.data.community_icon).split("?")[0]
              : "";
          const icon = communityIcon || about?.data?.icon_img || about?.data?.header_img;
          if (typeof icon === "string" && icon.startsWith("http")) {
            ogImage = icon;
          }
        }
      } catch (e) {
        // The icon is optional; keep going without an image.
        console.warn("[crawler] Reddit subreddit icon lookup failed:", e.message);
      }
    }

    return {
      url: postUrl,
      html: null,
      text: `${post.title || ""}\n\n${post.selftext || ""}`.trim(),
      readable_html: null,
      title: post.title || null,
      description: (post.selftext || "").slice(0, 200) || null,
      author: post.author ? `u/${post.author}` : null,
      og_image_url: ogImage ? unescapeAmp(ogImage) : null,
      favicon: null,
      screenshot: null,
      status_code: 200,
      error: null,
      subreddit: post.subreddit_name_prefixed || null,
    };
  } catch (e) {
    console.warn("[crawler] Reddit JSON failed:", e.message);
    return null;
  }
}
|
|
|
|
/**
 * Crawl a URL and collect its HTML, readable text, metadata and a screenshot.
 *
 * Reddit URLs are first attempted through fetchRedditJson(); every other URL,
 * and Reddit when that attempt returns null, is rendered in a fresh browser
 * context.
 *
 * @param {string} url - URL to crawl
 * @returns {Promise<object>} result object. Failures during navigation or
 *          extraction are reported through `result.error`, not thrown.
 * @throws when the browser cannot be launched (after one retry) or a
 *         context/page cannot be created
 */
async function crawl(url) {
  const isReddit = isRedditUrl(url);

  // Reddit: use JSON API (avoids login walls entirely)
  if (isReddit) {
    const redditData = await fetchRedditJson(url);
    if (redditData) {
      console.log(`[crawler] Reddit JSON OK: ${url} (og=${!!redditData.og_image_url})`);
      return redditData;
    }
    console.log(`[crawler] Reddit JSON failed, falling back to browser: ${url}`);
  }

  let b;
  try {
    b = await ensureBrowser();
  } catch (e) {
    console.error("[crawler] Browser launch failed, retrying:", e.message);
    // Drop the broken handle so the retry starts from scratch.
    browser = null;
    b = await ensureBrowser();
  }

  const contextOpts = {
    viewport: VIEWPORT,
    userAgent: USER_AGENT,
    ignoreHTTPSErrors: true,
  };

  // Reddit: send explicit Accept / Accept-Language request headers.
  if (isReddit) {
    contextOpts.extraHTTPHeaders = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
    };
  }

  const context = await b.newContext(contextOpts);
  let page;
  try {
    page = await context.newPage();
  } catch (e) {
    // Do not leak the context when the page cannot be created.
    await context.close().catch(() => {});
    throw e;
  }

  const result = {
    url,
    html: null,
    text: null,
    readable_html: null,
    title: null,
    description: null,
    author: null,
    og_image_url: null,
    favicon: null,
    screenshot: null, // base64
    status_code: null,
    error: null,
  };

  try {
    const response = await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout: NAV_TIMEOUT,
    });
    result.status_code = response?.status() || null;

    // Wait for network to settle (up to 5s)
    try {
      await page.waitForLoadState("networkidle", { timeout: 5000 });
    } catch {
      // networkidle timeout is fine, page is probably loaded enough
    }

    // Reddit: dismiss login modals and overlays
    if (isReddit) {
      await page
        .evaluate(() => {
          // Remove login modal/overlay
          document
            .querySelectorAll('shreddit-overlay-display, [id*="login"], .overlay-container, reddit-cookie-banner')
            .forEach((el) => el.remove());
          // Remove any body scroll locks
          document.body.style.overflow = 'auto';
          document.documentElement.style.overflow = 'auto';
        })
        .catch(() => {});
      await page.waitForTimeout(1000);
    }

    // Get rendered HTML + screenshot in parallel. A failed or timed-out
    // screenshot is tolerated; the rest of the crawl still succeeds.
    const [html, screenshot] = await Promise.all([
      page.content(),
      page
        .screenshot({
          type: "jpeg",
          quality: 80,
          fullPage: false,
          timeout: SCREENSHOT_TIMEOUT,
        })
        .catch((e) => {
          console.warn("[crawler] Screenshot failed:", e.message);
          return null;
        }),
    ]);

    result.html = html;

    // Extract text from the page's main content element (first 10k chars).
    result.text = await page
      .evaluate(() => {
        const el =
          document.querySelector("article") ||
          document.querySelector("main") ||
          document.querySelector('[role="main"]') ||
          document.body;
        return el ? el.innerText.slice(0, 10000) : "";
      })
      .catch(() => "");

    // Extract readable article HTML via Mozilla Readability; when it yields
    // text, that replaces the innerText-based text above.
    try {
      const dom = new JSDOM(html, { url });
      const reader = new Readability(dom.window.document);
      const article = reader.parse();
      if (article && article.content) {
        result.readable_html = article.content;
        if (article.textContent) {
          result.text = article.textContent.slice(0, 10000);
        }
      }
    } catch (e) {
      console.warn("[crawler] Readability failed:", e.message);
    }

    // Extract meta from rendered HTML
    const meta = extractMeta(html);
    result.title = meta.title || (await page.title()) || null;
    result.description = meta.description || null;
    result.author = meta.author || null;
    result.og_image_url = meta.og_image || null;
    result.favicon = meta.favicon || null;

    // Screenshot as base64
    if (screenshot) {
      result.screenshot = screenshot.toString("base64");
    }
  } catch (e) {
    const message = String(e?.message ?? e);
    result.error = message;
    console.error("[crawler] Crawl error:", url, message);
    // If browser crashed, reset it for next request
    if (message.includes("closed") || message.includes("crashed")) {
      browser = null;
    }
  } finally {
    await page.close().catch(() => {});
    await context.close().catch(() => {});
  }

  return result;
}
|
|
|
|
// Largest request body accepted by POST /crawl (the payload is just {"url": ...}).
const MAX_BODY_BYTES = 1024 * 1024;

/** Create an Error carrying the HTTP status code the handler should answer with. */
function httpError(statusCode, message) {
  return Object.assign(new Error(message), { statusCode });
}

/** Send `payload` as a JSON response with the given status code. */
function sendJson(res, statusCode, payload) {
  res.writeHead(statusCode, { "Content-Type": "application/json" });
  res.end(JSON.stringify(payload));
}

/**
 * Read a request body as UTF-8 text.
 *
 * Chunks are collected as Buffers and decoded once at the end, so a
 * multi-byte character split across two chunks is decoded correctly.
 *
 * @param {object} req - request stream emitting "data" / "end" / "error"
 * @param {number} [maxBytes] - size limit; larger bodies reject with a 413 error
 * @returns {Promise<string>} the decoded body
 */
function readRequestBody(req, maxBytes = MAX_BODY_BYTES) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let received = 0;
    let tooLarge = false;

    req.on("data", (chunk) => {
      const buf = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      received += buf.length;
      if (received > maxBytes) {
        // Keep draining the stream but stop buffering it.
        tooLarge = true;
        chunks.length = 0;
        return;
      }
      chunks.push(buf);
    });
    req.on("end", () => {
      if (tooLarge) {
        reject(httpError(413, "request body too large"));
      } else {
        resolve(Buffer.concat(chunks).toString("utf8"));
      }
    });
    req.on("error", reject);
  });
}

/**
 * Parse and validate the POST /crawl payload.
 *
 * @param {string} rawBody - request body text
 * @returns {string} the URL to crawl, exactly as supplied
 * @throws {Error} with `statusCode` 400 when the body is not JSON, `url` is
 *         missing, or `url` is not an absolute http(s) URL
 */
function parseCrawlTarget(rawBody) {
  let payload;
  try {
    payload = JSON.parse(rawBody);
  } catch {
    throw httpError(400, "request body must be valid JSON");
  }

  const url = payload?.url;
  if (!url) throw httpError(400, "url is required");
  if (typeof url !== "string") throw httpError(400, "url must be a string");

  // The URL comes from the client: only let web URLs reach the browser
  // (rejects file:, data:, chrome:, javascript:, view-source:, ...).
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    throw httpError(400, "url must be an absolute http(s) URL");
  }
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    throw httpError(400, "url must be an absolute http(s) URL");
  }
  return url;
}

/**
 * Request handler.
 *   GET  /health -> 200 {"status":"ok"}
 *   POST /crawl  -> 200 with the crawl result; 400/413 for bad requests;
 *                   500 when the crawl itself throws
 *   anything else -> 404
 */
async function handleRequest(req, res) {
  // Health check
  if (req.method === "GET" && req.url === "/health") {
    sendJson(res, 200, { status: "ok" });
    return;
  }

  // Crawl endpoint
  if (req.method === "POST" && req.url === "/crawl") {
    try {
      const body = await readRequestBody(req);
      const url = parseCrawlTarget(body);

      console.log(`[crawler] Crawling: ${url}`);
      const result = await crawl(url);
      console.log(
        `[crawler] Done: ${url} (status=${result.status_code}, og=${!!result.og_image_url}, ss=${!!result.screenshot})`
      );

      sendJson(res, 200, result);
    } catch (e) {
      const statusCode = e.statusCode ?? 500;
      // Client mistakes are expected; only log genuine server-side failures.
      if (statusCode >= 500) console.error("[crawler] Request error:", e);
      sendJson(res, statusCode, { error: e.message });
    }
    return;
  }

  res.writeHead(404);
  res.end("Not found");
}

// Simple HTTP server
const server = http.createServer(handleRequest);
|
|
|
|
/**
 * Startup: launch the browser first, then start accepting HTTP requests.
 */
async function start() {
  await ensureBrowser();
  server.listen(PORT, () => {
    console.log(`[crawler] Listening on :${PORT}`);
  });
}

// A failed launch at startup is fatal: report it and exit non-zero rather
// than leaving an unhandled promise rejection.
start().catch((e) => {
  console.error("[crawler] Startup failed:", e);
  process.exit(1);
});
|
|
|
|
/**
 * Graceful shutdown: close the browser if one exists (ignoring close
 * errors), then exit with status 0.
 */
async function shutdown() {
  console.log("[crawler] Shutting down...");
  if (browser) {
    await browser.close().catch(() => {});
  }
  process.exit(0);
}

process.on("SIGTERM", shutdown);
|