feat: major platform expansion — Brain service, RSS reader, iOS app, AI assistants, Firefox extension
Brain Service: - Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API) - AI classification with tag definitions and folder assignment - YouTube video download via yt-dlp - Karakeep migration complete (96 items) - Taxonomy management (folders with icons/colors, tags) - Discovery shuffle, sort options, search (Meilisearch + pgvector) - Item tag/folder editing, card color accents RSS Reader Service: - Custom FastAPI reader replacing Miniflux - Feed management (add/delete/refresh), category support - Full article extraction via Readability - Background content fetching for new entries - Mark all read with confirmation - Infinite scroll, retention cleanup (30/60 day) - 17 feeds migrated from Miniflux iOS App (SwiftUI): - Native iOS 17+ app with @Observable architecture - Cookie-based auth, configurable gateway URL - Dashboard with custom background photo + frosted glass widgets - Full fitness module (today/templates/goals/food library) - AI assistant chat (fitness + brain, raw JSON state management) - 120fps ProMotion support AI Assistants (Gateway): - Unified dispatcher with fitness/brain domain detection - Fitness: natural language food logging, photo analysis, multi-item splitting - Brain: save/append/update/delete notes, search & answer, undo support - Madiha user gets fitness-only (brain disabled) Firefox Extension: - One-click save to Brain from any page - Login with platform credentials - Right-click context menu (save page/link/image) - Notes field for URL saves - Signed and published on AMO Other: - Reader bookmark button routes to Brain (was Karakeep) - Fitness food library with "Add" button + add-to-meal popup - Kindle send file size check (25MB SMTP2GO limit) - Atelier UI as default (useAtelierShell=true) - Mobile upload box in nav drawer Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,20 +9,61 @@ from app.config import OPENAI_API_KEY, OPENAI_MODEL
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Tag name -> one-line definition string that is interpolated into the
# classifier's system prompt. Keys must match the tag taxonomy exactly;
# grouped by theme purely for readability.
TAG_DEFINITIONS = {
    # ── Tech / self-hosting ──
    "home-assistant": "Home Assistant specific content (dashboards, ESPHome, automations, integrations, Lovelace cards)",
    "server": "Server/infrastructure content (Docker, backups, networking, self-hosted apps, Linux)",
    "3d-printing": "3D printer files (STL), printer mods, filament, slicer settings, 3D printed objects and projects",
    # ── Family / lifestyle ──
    "kids": "Anything related to children, parenting, or educational content for kids",
    "homeschool": "Homeschooling resources, curriculum, lesson plans, educational materials for teaching kids at home, school projects, science experiments",
    "diy": "Physical hands-on projects around the house, yard, or vehicle — repairs, woodworking, crafts, building things. NOT software, dashboards, or digital projects.",
    "lawn-garden": "Lawn care, gardening, yard work, bug spraying, fertilizer, landscaping, plants, outdoor maintenance",
    "vanlife": "Van conversion, Promaster van, van build projects, camping in vans, van electrical/solar, van life lifestyle",
    "travel": "Destinations, resorts, hotels, trip ideas, reviews, places to visit",
    "churning": "Credit card points, miles, award travel, hotel loyalty programs, points maximization, sign-up bonuses",
    # ── Media / learning ──
    "video": "Video content (YouTube, TikTok, etc)",
    "tutorial": "How-to guides, step-by-step instructions, learning content",
    "books": "Book recommendations, reviews, or reading lists",
    "lectures": "Lecture notes, Islamic talks, sermon recordings, religious class notes",
    "piracy": "Anything to do with downloading content like Audiobooks, games",
    "shopping": "A product page, product review, or specific item you might want to buy (Amazon, stores, book reviews with purchase links). NOT general discussion threads or forums comparing many options.",
    "reference": "Lookup info like contacts, sizes, specs, measurements, settings to remember",
    "work": "Work-related content",
    # ── Per-person document tags ──
    "yusuf": "Personal document belonging to family member Yusuf (look for name in title or content)",
    "madiha": "Personal document belonging to family member Madiha (look for name in title or content)",
    "hafsa": "Personal document belonging to family member Hafsa (look for name in title or content)",
    "mustafa": "Personal document belonging to family member Mustafa (look for name in title or content)",
    # ── Records / paperwork ──
    "medical": "Medical records, allergy results, prescriptions, lab work, vaccination records, doctor notes",
    "legal": "Birth certificates, passports, IDs, citizenship papers, contracts, legal agreements",
    "vehicle": "Car registration, license plates, insurance cards, vehicle titles, maintenance records",
    "insurance": "Insurance policies, insurance cards, coverage documents, claims",
    "financial": "Tax documents, bank statements, pay stubs, loan papers, credit reports",
}
|
||||
|
||||
|
||||
def build_system_prompt(folders: list[str], tags: list[str]) -> str:
|
||||
tag_defs = "\n".join(
|
||||
f" - '{t}': {TAG_DEFINITIONS[t]}" if t in TAG_DEFINITIONS else f" - '{t}'"
|
||||
for t in tags
|
||||
)
|
||||
return f"""You are a classification engine for a personal "second brain" knowledge management system.
|
||||
|
||||
Given an item (URL, note, document, or file), you must return structured JSON with:
|
||||
- folder: exactly 1 from this list: {json.dumps(folders)}
|
||||
- tags: exactly 2 or 3 from this list: {json.dumps(tags)}
|
||||
- title: a concise, normalized title (max 80 chars)
|
||||
- tags: ONLY from this predefined list. Do NOT create any new tags outside this list. If no tags fit, return an empty array.
|
||||
- title: a concise, normalized title in Title Case with spaces (max 80 chars, e.g. 'Machine Learning', 'Web Development')
|
||||
- summary: a 1-2 sentence summary of the content (for links/documents only)
|
||||
- corrected_text: for NOTES ONLY — return the original note text with spelling/grammar fixed. Keep the original meaning, tone, and structure. Only fix typos and obvious errors. Return empty string for non-notes.
|
||||
- confidence: a float 0.0-1.0 indicating how confident you are
|
||||
|
||||
Tag definitions (only assign tags that STRONGLY match the content):
|
||||
{tag_defs}
|
||||
|
||||
Rules:
|
||||
- NEVER invent folders or tags not in the lists above
|
||||
- Only assign tags that STRONGLY match the content. 1-2 tags is perfectly fine.
|
||||
- Do NOT pad with extra tags just to reach a target number. If only one tag fits, only use one.
|
||||
- If NO tags fit the content, return an empty tags array.
|
||||
- Name tags: 'yusuf', 'madiha', 'hafsa', or 'mustafa' ONLY when the content is a personal document belonging to that family member (look for their name in the title or content)
|
||||
- NEVER skip classification
|
||||
- NEVER return freeform text outside the schema
|
||||
- For notes: do NOT summarize. Keep the original text. Only fix spelling.
|
||||
@@ -43,7 +84,7 @@ def build_response_schema(folders: list[str], tags: list[str]) -> dict:
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {"type": "string", "enum": tags},
|
||||
"minItems": 2,
|
||||
"minItems": 0,
|
||||
"maxItems": 3,
|
||||
},
|
||||
"title": {"type": "string"},
|
||||
@@ -88,8 +129,8 @@ async def classify_item(
|
||||
if not OPENAI_API_KEY:
|
||||
log.warning("No OPENAI_API_KEY set, returning defaults")
|
||||
return {
|
||||
"folder": "Knowledge",
|
||||
"tags": ["reference", "read-later"],
|
||||
"folder": "Home",
|
||||
"tags": ["reference"],
|
||||
"title": title or "Untitled",
|
||||
"summary": "No AI classification available",
|
||||
"confidence": 0.0,
|
||||
@@ -122,10 +163,8 @@ async def classify_item(
|
||||
|
||||
# Validate folder and tags are in allowed sets
|
||||
if result["folder"] not in folders:
|
||||
result["folder"] = folders[0] if folders else "Knowledge"
|
||||
result["folder"] = folders[0] if folders else "Home"
|
||||
result["tags"] = [t for t in result["tags"] if t in tags][:3]
|
||||
if len(result["tags"]) < 2:
|
||||
result["tags"] = (result["tags"] + ["reference", "read-later"])[:3]
|
||||
|
||||
return result
|
||||
|
||||
@@ -133,8 +172,8 @@ async def classify_item(
|
||||
log.error(f"Classification attempt {attempt + 1} failed: {e}")
|
||||
if attempt == retries:
|
||||
return {
|
||||
"folder": "Knowledge",
|
||||
"tags": ["reference", "read-later"],
|
||||
"folder": "Home",
|
||||
"tags": ["reference"],
|
||||
"title": title or "Untitled",
|
||||
"summary": f"Classification failed: {e}",
|
||||
"confidence": 0.0,
|
||||
|
||||
@@ -1,162 +1,218 @@
|
||||
"""Content ingestion — fetch, extract, screenshot, archive."""
|
||||
"""Content ingestion — Playwright crawler for HTML, screenshots, og:image."""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from html.parser import HTMLParser
|
||||
from io import StringIO
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import BROWSERLESS_URL
|
||||
from app.config import CRAWLER_URL
|
||||
from app.services.storage import storage
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _HTMLTextExtractor(HTMLParser):
|
||||
"""Simple HTML to text converter."""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._result = StringIO()
|
||||
self._skip = False
|
||||
self._skip_tags = {"script", "style", "noscript", "svg"}
|
||||
# ── YouTube helpers ──
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag in self._skip_tags:
|
||||
self._skip = True
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag in self._skip_tags:
|
||||
self._skip = False
|
||||
if tag in ("p", "div", "br", "h1", "h2", "h3", "h4", "li", "tr"):
|
||||
self._result.write("\n")
|
||||
|
||||
def handle_data(self, data):
|
||||
if not self._skip:
|
||||
self._result.write(data)
|
||||
|
||||
def get_text(self) -> str:
|
||||
raw = self._result.getvalue()
|
||||
# Collapse whitespace
|
||||
lines = [line.strip() for line in raw.splitlines()]
|
||||
return "\n".join(line for line in lines if line)
|
||||
def _extract_youtube_id(url: str) -> str | None:
|
||||
patterns = [
|
||||
r'(?:youtube\.com/watch\?.*v=|youtu\.be/|youtube\.com/shorts/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
|
||||
]
|
||||
for pat in patterns:
|
||||
m = re.search(pat, url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def html_to_text(html: str) -> str:
    """Strip markup from *html* and return collapsed plain text."""
    parser = _HTMLTextExtractor()
    parser.feed(html)
    return parser.get_text()
|
||||
def _is_youtube_url(url: str) -> bool:
    """True when *url* points at a recognizable YouTube video."""
    return _extract_youtube_id(url) is not None
|
||||
|
||||
|
||||
def extract_title_from_html(html: str) -> str | None:
|
||||
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
|
||||
return match.group(1).strip() if match else None
|
||||
async def fetch_youtube_metadata(url: str) -> dict | None:
    """Fetch YouTube video metadata via oEmbed. No API key needed.

    Returns a dict with title/description/author/thumbnail_url/video_id/
    is_short, or None when *url* is not a YouTube video. Network failures
    are logged and the partially-filled dict is still returned.
    """
    video_id = _extract_youtube_id(url)
    if not video_id:
        return None

    result = {
        "title": None,
        "description": None,
        "author": None,
        # NOTE(review): assumes maxresdefault.jpg exists for this video id —
        # YouTube serves a placeholder for some videos; confirm downstream.
        "thumbnail_url": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
        "video_id": video_id,
        "is_short": "/shorts/" in url,
    }

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            # Primary source: YouTube's own oEmbed endpoint.
            oembed_url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
            resp = await client.get(oembed_url)
            if resp.status_code == 200:
                data = resp.json()
                result["title"] = data.get("title")
                result["author"] = data.get("author_name")

            # Secondary source: noembed.com fills any fields oEmbed missed.
            noembed_url = f"https://noembed.com/embed?url=https://www.youtube.com/watch?v={video_id}"
            resp2 = await client.get(noembed_url)
            if resp2.status_code == 200:
                data2 = resp2.json()
                if not result["title"]:
                    result["title"] = data2.get("title")
                if not result["author"]:
                    result["author"] = data2.get("author_name")
    except Exception as e:
        log.warning(f"YouTube metadata fetch failed: {e}")

    return result
|
||||
|
||||
|
||||
async def fetch_with_browserless(url: str) -> dict | None:
|
||||
"""Use browserless/chrome to render JS-heavy pages."""
|
||||
async def download_youtube_thumbnail(url: str, item_id: str) -> str | None:
    """Download YouTube thumbnail and save as screenshot asset.

    Tries the maxres image first, then the hq fallback. Returns the storage
    path, or None when *url* is not a YouTube video or no usable image was
    fetched.
    """
    video_id = _extract_youtube_id(url)
    if not video_id:
        return None

    # Highest quality first, then the fallback size.
    candidates = [
        f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
        f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg",
    ]
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            for thumb_url in candidates:
                resp = await client.get(thumb_url)
                # Require > 1000 bytes — presumably filters YouTube's tiny
                # placeholder image; confirm against live responses.
                if resp.status_code == 200 and len(resp.content) > 1000:
                    return storage.save(
                        item_id=item_id, asset_type="screenshot",
                        filename="thumbnail.jpg", data=resp.content,
                    )
    except Exception as e:
        log.warning(f"YouTube thumbnail download failed: {e}")
    return None
|
||||
|
||||
|
||||
async def take_screenshot(url: str, item_id: str) -> str | None:
|
||||
"""Take a screenshot of a URL using browserless. Returns storage path or None."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30) as client:
|
||||
resp = await client.post(
|
||||
f"{BROWSERLESS_URL}/screenshot",
|
||||
json={
|
||||
"url": url,
|
||||
"options": {"type": "png", "fullPage": False},
|
||||
"waitForTimeout": 3000,
|
||||
},
|
||||
async def download_youtube_video(url: str, item_id: str) -> tuple[str | None, dict]:
    """Download YouTube video via yt-dlp.

    Fetches up to 720p as mp4 into a temporary directory, stores the file
    through the storage service, and returns (storage_path, info_dict) where
    info_dict is yt-dlp's sidecar metadata. Returns (None, {}) when *url* is
    not a YouTube video, yt-dlp fails or times out, or no mp4 was produced.
    """
    import asyncio
    import json as _json
    import os
    import subprocess
    import tempfile

    video_id = _extract_youtube_id(url)
    if not video_id:
        return None, {}

    with tempfile.TemporaryDirectory() as tmpdir:
        outpath = os.path.join(tmpdir, "%(id)s.%(ext)s")
        cmd = [
            "yt-dlp", "--no-playlist",
            "-f", "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best[height<=720]",
            "--merge-output-format", "mp4",
            "--write-info-json", "--no-write-playlist-metafiles",
            "-o", outpath, url,
        ]
        try:
            # Run the blocking subprocess off the event loop thread.
            proc = await asyncio.to_thread(
                subprocess.run, cmd, capture_output=True, text=True, timeout=120,
            )
            if proc.returncode != 0:
                log.warning(f"yt-dlp failed: {proc.stderr[:300]}")
                return None, {}

            # Locate the merged mp4 and the .info.json metadata sidecar.
            video_file = None
            info = {}
            for f in os.listdir(tmpdir):
                if f.endswith(".mp4"):
                    video_file = os.path.join(tmpdir, f)
                elif f.endswith(".info.json"):
                    with open(os.path.join(tmpdir, f)) as fh:
                        info = _json.load(fh)

            if not video_file:
                return None, {}

            # Read via a context manager so the handle is always closed
            # (the original `open(...).read()` leaked the file object).
            with open(video_file, "rb") as vf:
                file_data = vf.read()
            path = storage.save(
                item_id=item_id, asset_type="video",
                filename=f"{video_id}.mp4", data=file_data,
            )
            log.info(f"Downloaded YouTube video: {len(file_data)} bytes -> {path}")
            return path, info
        except subprocess.TimeoutExpired:
            log.warning(f"yt-dlp timed out for {url}")
            return None, {}
        except Exception as e:
            log.error(f"YouTube download failed: {e}")
            return None, {}
|
||||
|
||||
|
||||
# ── Main crawler (Playwright stealth service) ──


async def crawl_url(url: str) -> dict:
    """Call the Playwright crawler service. Returns dict with html, text, title,
    description, author, og_image_url, screenshot (base64), status_code, error.

    On any failure a stub dict is returned whose "error" field carries the
    actual failure reason.
    """
    error = "unknown"
    try:
        async with httpx.AsyncClient(timeout=45) as client:
            resp = await client.post(f"{CRAWLER_URL}/crawl", json={"url": url})
            if resp.status_code == 200:
                return resp.json()
            log.warning(f"Crawler returned {resp.status_code} for {url}")
            error = f"crawler returned HTTP {resp.status_code}"
    except Exception as e:
        log.error(f"Crawler request failed for {url}: {e}")
        # Capture the message here: Python unbinds `e` when the except block
        # exits, so the original `str(e) if 'e' in dir() else "unknown"` probe
        # always reported "unknown".
        error = str(e)
    return {"url": url, "html": None, "text": None, "title": None,
            "description": None, "og_image_url": None, "screenshot": None, "error": error}
|
||||
|
||||
|
||||
async def save_screenshot_from_base64(b64: str, item_id: str) -> str | None:
|
||||
"""Decode base64 screenshot and save to storage."""
|
||||
try:
|
||||
data = base64.b64decode(b64)
|
||||
if len(data) < 500:
|
||||
return None
|
||||
path = storage.save(
|
||||
item_id=item_id, asset_type="screenshot",
|
||||
filename="screenshot.jpg", data=data,
|
||||
)
|
||||
return path
|
||||
except Exception as e:
|
||||
log.error(f"Screenshot save failed: {e}")
|
||||
return None
|
||||
|
||||
|
||||
async def download_og_image(og_url: str, item_id: str) -> str | None:
    """Download an og:image and save as asset.

    Returns the storage path, or None when the image is missing, tiny
    (<= 1000 bytes), or the request fails.
    """
    # Clean HTML entities from URL — og:image URLs scraped from markup carry
    # encoded ampersands. (The mangled source had the no-op replace("&", "&");
    # the intended entity decode is restored here.)
    og_url = og_url.replace("&amp;", "&")
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(og_url, headers={
                "User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"
            })
            if resp.status_code == 200 and len(resp.content) > 1000:
                # Choose the file extension from the content type; default jpg.
                ct = resp.headers.get("content-type", "image/jpeg")
                ext = "png" if "png" in ct else "jpg"
                path = storage.save(
                    item_id=item_id, asset_type="og_image",
                    filename=f"og_image.{ext}", data=resp.content,
                )
                log.info(f"Downloaded og:image ({len(resp.content)} bytes) for {item_id}")
                return path
    except Exception as e:
        log.warning(f"og:image download failed: {e}")
    return None
|
||||
|
||||
|
||||
async def archive_html(html: str, item_id: str) -> str | None:
|
||||
"""Save the full HTML as an archived asset."""
|
||||
"""Save full HTML as an archived asset."""
|
||||
if not html:
|
||||
return None
|
||||
try:
|
||||
path = storage.save(
|
||||
item_id=item_id,
|
||||
asset_type="archived_html",
|
||||
filename="page.html",
|
||||
data=html.encode("utf-8"),
|
||||
item_id=item_id, asset_type="archived_html",
|
||||
filename="page.html", data=html.encode("utf-8"),
|
||||
)
|
||||
return path
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user