feat: major platform expansion — Brain service, RSS reader, iOS app, AI assistants, Firefox extension
Brain Service: - Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API) - AI classification with tag definitions and folder assignment - YouTube video download via yt-dlp - Karakeep migration complete (96 items) - Taxonomy management (folders with icons/colors, tags) - Discovery shuffle, sort options, search (Meilisearch + pgvector) - Item tag/folder editing, card color accents RSS Reader Service: - Custom FastAPI reader replacing Miniflux - Feed management (add/delete/refresh), category support - Full article extraction via Readability - Background content fetching for new entries - Mark all read with confirmation - Infinite scroll, retention cleanup (30/60 day) - 17 feeds migrated from Miniflux iOS App (SwiftUI): - Native iOS 17+ app with @Observable architecture - Cookie-based auth, configurable gateway URL - Dashboard with custom background photo + frosted glass widgets - Full fitness module (today/templates/goals/food library) - AI assistant chat (fitness + brain, raw JSON state management) - 120fps ProMotion support AI Assistants (Gateway): - Unified dispatcher with fitness/brain domain detection - Fitness: natural language food logging, photo analysis, multi-item splitting - Brain: save/append/update/delete notes, search & answer, undo support - Madiha user gets fitness-only (brain disabled) Firefox Extension: - One-click save to Brain from any page - Login with platform credentials - Right-click context menu (save page/link/image) - Notes field for URL saves - Signed and published on AMO Other: - Reader bookmark button routes to Brain (was Karakeep) - Fitness food library with "Add" button + add-to-meal popup - Kindle send file size check (25MB SMTP2GO limit) - Atelier UI as default (useAtelierShell=true) - Mobile upload box in nav drawer Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,8 +2,8 @@ FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app

# ffmpeg is needed by yt-dlp to merge separate video/audio streams;
# tesseract provides OCR; libpq-dev backs the Postgres driver.
RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev tesseract-ocr tesseract-ocr-eng ffmpeg && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip yt-dlp

COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
@@ -13,10 +13,10 @@ from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.api.deps import get_user_id, get_db_session
|
||||
from app.config import FOLDERS, TAGS
|
||||
from app.models.item import Item, ItemAsset
|
||||
from app.models.item import Item, ItemAsset, ItemAddition
|
||||
from app.models.schema import (
|
||||
ItemCreate, ItemUpdate, ItemOut, ItemList, SearchQuery, SemanticSearchQuery,
|
||||
HybridSearchQuery, SearchResult, ConfigOut,
|
||||
HybridSearchQuery, SearchResult, ConfigOut, ItemAdditionCreate, ItemAdditionOut,
|
||||
)
|
||||
from app.services.storage import storage
|
||||
from fastapi.responses import Response
|
||||
@@ -25,6 +25,46 @@ from app.worker.tasks import enqueue_process_item
|
||||
router = APIRouter(prefix="/api", tags=["brain"])
|
||||
|
||||
|
||||
async def refresh_item_search_state(db: AsyncSession, item: Item):
    """Recompute embedding + Meilisearch doc after assistant additions change.

    Rebuilds the item's searchable text from raw content, extracted text and
    all of its additions (oldest first), regenerates the embedding (previous
    embedding is kept when generation returns nothing), commits, then pushes
    the refreshed document to the search index.
    """
    # Local imports keep the search/embedding dependencies off module import.
    from app.search.engine import index_item
    from app.services.embed import generate_embedding

    additions_result = await db.execute(
        select(ItemAddition)
        .where(ItemAddition.item_id == item.id, ItemAddition.user_id == item.user_id)
        .order_by(ItemAddition.created_at.asc())
    )
    additions = additions_result.scalars().all()
    # Chronological concatenation; whitespace-only additions are dropped.
    additions_text = "\n\n".join(addition.content for addition in additions if addition.content.strip())

    searchable_text_parts = [item.raw_content or "", item.extracted_text or "", additions_text]
    searchable_text = "\n\n".join(part.strip() for part in searchable_text_parts if part and part.strip())

    embed_text = f"{item.title or ''}\n{item.summary or ''}\n{searchable_text}".strip()
    embedding = await generate_embedding(embed_text)
    # Only replace the stored embedding when generation succeeded.
    if embedding:
        item.embedding = embedding

    item.updated_at = datetime.utcnow()
    # Persist the DB state before pushing the document to the search index.
    await db.commit()
    await db.refresh(item)

    await index_item({
        "id": item.id,
        "user_id": item.user_id,
        "type": item.type,
        "title": item.title,
        "url": item.url,
        "folder": item.folder,
        "tags": item.tags or [],
        "summary": item.summary,
        # The index keeps a bounded slice of the searchable text.
        "extracted_text": searchable_text[:10000],
        "processing_status": item.processing_status,
        "created_at": item.created_at.isoformat() if item.created_at else None,
    })
|
||||
|
||||
|
||||
# ── Health ──
|
||||
|
||||
@router.get("/health")
|
||||
@@ -201,14 +241,31 @@ async def update_item(
|
||||
item.title = body.title
|
||||
if body.folder is not None:
|
||||
item.folder = body.folder
|
||||
# Update folder_id FK
|
||||
from app.models.taxonomy import Folder as FolderModel
|
||||
folder_row = (await db.execute(
|
||||
select(FolderModel).where(FolderModel.user_id == user_id, FolderModel.name == body.folder)
|
||||
)).scalar_one_or_none()
|
||||
item.folder_id = folder_row.id if folder_row else None
|
||||
if body.tags is not None:
|
||||
item.tags = body.tags
|
||||
# Update item_tags relational entries
|
||||
from app.models.taxonomy import Tag as TagModel, ItemTag
|
||||
from sqlalchemy import delete as sa_delete
|
||||
await db.execute(sa_delete(ItemTag).where(ItemTag.item_id == item.id))
|
||||
for tag_name in body.tags:
|
||||
tag_row = (await db.execute(
|
||||
select(TagModel).where(TagModel.user_id == user_id, TagModel.name == tag_name)
|
||||
)).scalar_one_or_none()
|
||||
if tag_row:
|
||||
db.add(ItemTag(item_id=item.id, tag_id=tag_row.id))
|
||||
if body.raw_content is not None:
|
||||
item.raw_content = body.raw_content
|
||||
|
||||
item.updated_at = datetime.utcnow()
|
||||
await db.commit()
|
||||
await db.refresh(item)
|
||||
await refresh_item_search_state(db, item)
|
||||
return item
|
||||
|
||||
|
||||
@@ -238,6 +295,100 @@ async def delete_item(
|
||||
return {"status": "deleted"}
|
||||
|
||||
|
||||
@router.get("/items/{item_id}/additions", response_model=list[ItemAdditionOut])
async def list_item_additions(
    item_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """List an item's additions, oldest first; 404 if the item isn't the caller's."""
    # Ownership check before exposing any additions.
    owner_result = await db.execute(
        select(Item).where(Item.id == item_id, Item.user_id == user_id)
    )
    if owner_result.scalar_one_or_none() is None:
        raise HTTPException(status_code=404, detail="Item not found")

    rows = await db.execute(
        select(ItemAddition)
        .where(ItemAddition.item_id == item_id, ItemAddition.user_id == user_id)
        .order_by(ItemAddition.created_at.asc())
    )
    return rows.scalars().all()
|
||||
|
||||
|
||||
@router.post("/items/{item_id}/additions", response_model=ItemAdditionOut, status_code=201)
async def create_item_addition(
    item_id: str,
    body: ItemAdditionCreate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Create an addition on one of the caller's items, then resync search state.

    Raises:
        404 if the item does not exist or belongs to another user.
        400 if the addition content is empty/whitespace-only.
    Returns the persisted addition (HTTP 201).
    """
    # Ownership check: the item must exist and belong to this user.
    item = (await db.execute(
        select(Item).where(Item.id == item_id, Item.user_id == user_id)
    )).scalar_one_or_none()
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    content = body.content.strip()
    if not content:
        raise HTTPException(status_code=400, detail="Addition content cannot be empty")

    # Blank or whitespace-only source/kind fall back to their defaults.
    addition = ItemAddition(
        id=str(uuid.uuid4()),
        item_id=item.id,
        user_id=user_id,
        source=(body.source or "assistant").strip() or "assistant",
        kind=(body.kind or "append").strip() or "append",
        content=content,
        metadata_json=body.metadata_json or {},
    )
    db.add(addition)
    item.updated_at = datetime.utcnow()
    await db.commit()
    await db.refresh(addition)

    # Re-select the item after commit before recomputing search state.
    # NOTE(review): presumably this avoids touching expired ORM attributes on
    # the async session — confirm against the session's expire_on_commit config.
    result = await db.execute(
        select(Item).where(Item.id == item.id, Item.user_id == user_id)
    )
    fresh_item = result.scalar_one()
    await refresh_item_search_state(db, fresh_item)
    return addition
|
||||
|
||||
|
||||
@router.delete("/items/{item_id}/additions/{addition_id}")
async def delete_item_addition(
    item_id: str,
    addition_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Delete a single addition from an item the caller owns, then resync search state."""
    # The item must exist and belong to this user.
    item_result = await db.execute(
        select(Item).where(Item.id == item_id, Item.user_id == user_id)
    )
    item = item_result.scalar_one_or_none()
    if item is None:
        raise HTTPException(status_code=404, detail="Item not found")

    # The addition must belong to both this item and this user.
    addition_result = await db.execute(
        select(ItemAddition).where(
            ItemAddition.id == addition_id,
            ItemAddition.item_id == item_id,
            ItemAddition.user_id == user_id,
        )
    )
    addition = addition_result.scalar_one_or_none()
    if addition is None:
        raise HTTPException(status_code=404, detail="Addition not found")

    await db.delete(addition)
    item.updated_at = datetime.utcnow()
    await db.commit()

    # Re-select the item after commit before recomputing embeddings/search doc.
    refreshed_result = await db.execute(
        select(Item).where(Item.id == item.id, Item.user_id == user_id)
    )
    await refresh_item_search_state(db, refreshed_result.scalar_one())
    return {"status": "deleted"}
|
||||
|
||||
|
||||
# ── Reprocess item ──
|
||||
|
||||
@router.post("/items/{item_id}/reprocess", response_model=ItemOut)
|
||||
@@ -335,5 +486,7 @@ async def serve_asset(item_id: str, asset_type: str, filename: str):
|
||||
elif filename.endswith(".jpg") or filename.endswith(".jpeg"): ct = "image/jpeg"
|
||||
elif filename.endswith(".html"): ct = "text/html"
|
||||
elif filename.endswith(".pdf"): ct = "application/pdf"
|
||||
elif filename.endswith(".mp4"): ct = "video/mp4"
|
||||
elif filename.endswith(".webm"): ct = "video/webm"
|
||||
|
||||
return Response(content=data, media_type=ct, headers={"Cache-Control": "public, max-age=3600"})
|
||||
|
||||
@@ -17,8 +17,8 @@ MEILI_URL = os.environ.get("MEILI_URL", "http://brain-meili:7700")
|
||||
MEILI_KEY = os.environ.get("MEILI_MASTER_KEY", "brain-meili-key")
|
||||
MEILI_INDEX = "items"
|
||||
|
||||
# ── Browserless ──
# NOTE(review): superseded by the Playwright crawler below; kept only while
# legacy fetch_with_browserless callers remain — confirm before removing.
BROWSERLESS_URL = os.environ.get("BROWSERLESS_URL", "http://brain-browserless:3000")
# ── Crawler ──
# Playwright stealth crawler service endpoint (see crawl_url in ingest).
CRAWLER_URL = os.environ.get("CRAWLER_URL", "http://brain-crawler:3100")
|
||||
|
||||
# ── OpenAI ──
|
||||
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
|
||||
@@ -42,14 +42,14 @@ DEBUG = os.environ.get("DEBUG", "").lower() in ("1", "true")
|
||||
|
||||
# ── Classification rules ──
# Fallback classification taxonomy used by the AI classifier. The diff merge
# had left the pre-rename folder list concatenated onto the new one (implicit
# string concatenation produced "ProjectsHome") and duplicated many tags —
# only the current lists are kept.
FOLDERS = [
    "Home", "Family", "Work", "Travel", "Islam",
    "Homelab", "Vanlife", "3D Printing", "Documents",
]

TAGS = [
    "diy", "reference", "home-assistant", "shopping", "video",
    "tutorial", "server", "kids", "books", "travel",
    "churning", "lawn-garden", "piracy", "work", "3d-printing",
    "lectures", "vanlife", "yusuf", "madiha", "hafsa", "mustafa",
    "medical", "legal", "vehicle", "insurance", "financial", "homeschool",
]
|
||||
|
||||
@@ -31,7 +31,7 @@ app.include_router(taxonomy_router)
|
||||
async def startup():
|
||||
from sqlalchemy import text as sa_text
|
||||
from app.database import engine, Base
|
||||
from app.models.item import Item, ItemAsset, AppLink # noqa: import to register models
|
||||
from app.models.item import Item, ItemAsset, AppLink, ItemAddition # noqa: import to register models
|
||||
from app.models.taxonomy import Folder, Tag, ItemTag # noqa: register taxonomy tables
|
||||
|
||||
# Enable pgvector extension before creating tables
|
||||
|
||||
@@ -45,6 +45,12 @@ class Item(Base):
|
||||
|
||||
# Relationships
|
||||
assets = relationship("ItemAsset", back_populates="item", cascade="all, delete-orphan")
|
||||
additions = relationship(
|
||||
"ItemAddition",
|
||||
back_populates="item",
|
||||
cascade="all, delete-orphan",
|
||||
order_by="ItemAddition.created_at",
|
||||
)
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_items_user_status", "user_id", "processing_status"),
|
||||
@@ -79,3 +85,19 @@ class AppLink(Base):
|
||||
app = Column(String(64), nullable=False) # trips|tasks|fitness|inventory
|
||||
app_entity_id = Column(String(128), nullable=False)
|
||||
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||
|
||||
|
||||
class ItemAddition(Base):
    """Append-only content attached to an item after creation.

    Used by the AI assistants (and potentially manual edits) to append notes
    to an existing item; rows cascade-delete with the parent item.
    """
    __tablename__ = "item_additions"

    id = Column(UUID(as_uuid=False), primary_key=True, default=new_id)
    # FK with DB-level cascade so additions vanish with their item.
    item_id = Column(UUID(as_uuid=False), ForeignKey("items.id", ondelete="CASCADE"), nullable=False, index=True)
    # Denormalized owner id, duplicated from the item for direct filtering.
    user_id = Column(String(64), nullable=False, index=True)
    source = Column(String(32), nullable=False, default="assistant")  # assistant|manual
    kind = Column(String(32), nullable=False, default="append")  # append
    content = Column(Text, nullable=False)
    # Free-form metadata payload supplied by the caller.
    metadata_json = Column(JSONB, nullable=True, default=dict)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)

    item = relationship("Item", back_populates="additions")
|
||||
|
||||
@@ -26,6 +26,13 @@ class ItemUpdate(BaseModel):
|
||||
raw_content: Optional[str] = None
|
||||
|
||||
|
||||
class ItemAdditionCreate(BaseModel):
    """Request body for POST /items/{item_id}/additions."""
    content: str
    # "assistant" | "manual"; blank values are normalized server-side.
    source: Optional[str] = "assistant"
    kind: Optional[str] = "append"
    metadata_json: Optional[dict] = None
|
||||
|
||||
|
||||
class SearchQuery(BaseModel):
|
||||
q: str
|
||||
folder: Optional[str] = None
|
||||
@@ -63,6 +70,19 @@ class AssetOut(BaseModel):
|
||||
model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
class ItemAdditionOut(BaseModel):
    """Serialized ItemAddition row returned by the additions endpoints."""
    id: str
    item_id: str
    source: str
    kind: str
    content: str
    metadata_json: Optional[dict] = None
    created_at: datetime
    updated_at: datetime

    # Allow construction directly from ORM objects.
    model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
class ItemOut(BaseModel):
|
||||
id: str
|
||||
type: str
|
||||
|
||||
@@ -70,23 +70,24 @@ class ItemTag(Base):
|
||||
|
||||
# Default folders with colors and icons, seeded for new users. The diff merge
# had left both the old entries (Knowledge/Faith/Projects) and the renamed
# set in the list, duplicating Home/Family/Work/Travel — only the current
# entries are kept.
DEFAULT_FOLDERS = [
    {"name": "Home", "color": "#059669", "icon": "home"},
    {"name": "Family", "color": "#D97706", "icon": "heart"},
    {"name": "Work", "color": "#4338CA", "icon": "briefcase"},
    {"name": "Travel", "color": "#0EA5E9", "icon": "plane"},
    {"name": "Islam", "color": "#10B981", "icon": "moon"},
    {"name": "Homelab", "color": "#6366F1", "icon": "server"},
    {"name": "Vanlife", "color": "#F59E0B", "icon": "truck"},
    {"name": "3D Printing", "color": "#EC4899", "icon": "printer"},
    {"name": "Documents", "color": "#78716C", "icon": "file-text"},
]
|
||||
|
||||
# Default tags to seed for new users. The diff merge had left the old tag set
# prepended to the new one, producing duplicates ("reference", "travel", …);
# only the current, de-duplicated list is kept.
DEFAULT_TAGS = [
    "diy", "reference", "home-assistant", "shopping", "video",
    "tutorial", "server", "kids", "books", "travel",
    "churning", "lawn-garden", "piracy", "work", "3d-printing",
    "lectures", "vanlife", "yusuf", "madiha", "hafsa", "mustafa",
    "medical", "legal", "vehicle", "insurance", "financial", "homeschool",
]
|
||||
|
||||
|
||||
|
||||
@@ -9,20 +9,61 @@ from app.config import OPENAI_API_KEY, OPENAI_MODEL
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Human-written definitions injected into the classifier's system prompt so
# the model assigns tags by meaning, not by name. Tags without an entry here
# are listed to the model by name only (see build_system_prompt).
TAG_DEFINITIONS = {
    "home-assistant": "Home Assistant specific content (dashboards, ESPHome, automations, integrations, Lovelace cards)",
    "server": "Server/infrastructure content (Docker, backups, networking, self-hosted apps, Linux)",
    "kids": "Anything related to children, parenting, or educational content for kids",
    "shopping": "A product page, product review, or specific item you might want to buy (Amazon, stores, book reviews with purchase links). NOT general discussion threads or forums comparing many options.",
    "diy": "Physical hands-on projects around the house, yard, or vehicle — repairs, woodworking, crafts, building things. NOT software, dashboards, or digital projects.",
    "reference": "Lookup info like contacts, sizes, specs, measurements, settings to remember",
    "video": "Video content (YouTube, TikTok, etc)",
    "tutorial": "How-to guides, step-by-step instructions, learning content",
    "books": "Book recommendations, reviews, or reading lists",
    "travel": "Destinations, resorts, hotels, trip ideas, reviews, places to visit",
    "churning": "Credit card points, miles, award travel, hotel loyalty programs, points maximization, sign-up bonuses",
    "lawn-garden": "Lawn care, gardening, yard work, bug spraying, fertilizer, landscaping, plants, outdoor maintenance",
    "piracy": "Anything to do with downloading content like Audiobooks, games",
    "lectures": "Lecture notes, Islamic talks, sermon recordings, religious class notes",
    "3d-printing": "3D printer files (STL), printer mods, filament, slicer settings, 3D printed objects and projects",
    "work": "Work-related content",
    "vanlife": "Van conversion, Promaster van, van build projects, camping in vans, van electrical/solar, van life lifestyle",
    "yusuf": "Personal document belonging to family member Yusuf (look for name in title or content)",
    "madiha": "Personal document belonging to family member Madiha (look for name in title or content)",
    "hafsa": "Personal document belonging to family member Hafsa (look for name in title or content)",
    "mustafa": "Personal document belonging to family member Mustafa (look for name in title or content)",
    "medical": "Medical records, allergy results, prescriptions, lab work, vaccination records, doctor notes",
    "legal": "Birth certificates, passports, IDs, citizenship papers, contracts, legal agreements",
    "vehicle": "Car registration, license plates, insurance cards, vehicle titles, maintenance records",
    "insurance": "Insurance policies, insurance cards, coverage documents, claims",
    "financial": "Tax documents, bank statements, pay stubs, loan papers, credit reports",
    "homeschool": "Homeschooling resources, curriculum, lesson plans, educational materials for teaching kids at home, school projects, science experiments",
}
|
||||
|
||||
|
||||
def build_system_prompt(folders: list[str], tags: list[str]) -> str:
|
||||
tag_defs = "\n".join(
|
||||
f" - '{t}': {TAG_DEFINITIONS[t]}" if t in TAG_DEFINITIONS else f" - '{t}'"
|
||||
for t in tags
|
||||
)
|
||||
return f"""You are a classification engine for a personal "second brain" knowledge management system.
|
||||
|
||||
Given an item (URL, note, document, or file), you must return structured JSON with:
|
||||
- folder: exactly 1 from this list: {json.dumps(folders)}
|
||||
- tags: exactly 2 or 3 from this list: {json.dumps(tags)}
|
||||
- title: a concise, normalized title (max 80 chars)
|
||||
- tags: ONLY from this predefined list. Do NOT create any new tags outside this list. If no tags fit, return an empty array.
|
||||
- title: a concise, normalized title in Title Case with spaces (max 80 chars, e.g. 'Machine Learning', 'Web Development')
|
||||
- summary: a 1-2 sentence summary of the content (for links/documents only)
|
||||
- corrected_text: for NOTES ONLY — return the original note text with spelling/grammar fixed. Keep the original meaning, tone, and structure. Only fix typos and obvious errors. Return empty string for non-notes.
|
||||
- confidence: a float 0.0-1.0 indicating how confident you are
|
||||
|
||||
Tag definitions (only assign tags that STRONGLY match the content):
|
||||
{tag_defs}
|
||||
|
||||
Rules:
|
||||
- NEVER invent folders or tags not in the lists above
|
||||
- Only assign tags that STRONGLY match the content. 1-2 tags is perfectly fine.
|
||||
- Do NOT pad with extra tags just to reach a target number. If only one tag fits, only use one.
|
||||
- If NO tags fit the content, return an empty tags array.
|
||||
- Name tags: 'yusuf', 'madiha', 'hafsa', or 'mustafa' ONLY when the content is a personal document belonging to that family member (look for their name in the title or content)
|
||||
- NEVER skip classification
|
||||
- NEVER return freeform text outside the schema
|
||||
- For notes: do NOT summarize. Keep the original text. Only fix spelling.
|
||||
@@ -43,7 +84,7 @@ def build_response_schema(folders: list[str], tags: list[str]) -> dict:
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {"type": "string", "enum": tags},
|
||||
"minItems": 2,
|
||||
"minItems": 0,
|
||||
"maxItems": 3,
|
||||
},
|
||||
"title": {"type": "string"},
|
||||
@@ -88,8 +129,8 @@ async def classify_item(
|
||||
if not OPENAI_API_KEY:
|
||||
log.warning("No OPENAI_API_KEY set, returning defaults")
|
||||
return {
|
||||
"folder": "Knowledge",
|
||||
"tags": ["reference", "read-later"],
|
||||
"folder": "Home",
|
||||
"tags": ["reference"],
|
||||
"title": title or "Untitled",
|
||||
"summary": "No AI classification available",
|
||||
"confidence": 0.0,
|
||||
@@ -122,10 +163,8 @@ async def classify_item(
|
||||
|
||||
# Validate folder and tags are in allowed sets
|
||||
if result["folder"] not in folders:
|
||||
result["folder"] = folders[0] if folders else "Knowledge"
|
||||
result["folder"] = folders[0] if folders else "Home"
|
||||
result["tags"] = [t for t in result["tags"] if t in tags][:3]
|
||||
if len(result["tags"]) < 2:
|
||||
result["tags"] = (result["tags"] + ["reference", "read-later"])[:3]
|
||||
|
||||
return result
|
||||
|
||||
@@ -133,8 +172,8 @@ async def classify_item(
|
||||
log.error(f"Classification attempt {attempt + 1} failed: {e}")
|
||||
if attempt == retries:
|
||||
return {
|
||||
"folder": "Knowledge",
|
||||
"tags": ["reference", "read-later"],
|
||||
"folder": "Home",
|
||||
"tags": ["reference"],
|
||||
"title": title or "Untitled",
|
||||
"summary": f"Classification failed: {e}",
|
||||
"confidence": 0.0,
|
||||
|
||||
@@ -1,162 +1,218 @@
|
||||
"""Content ingestion — fetch, extract, screenshot, archive."""
|
||||
"""Content ingestion — Playwright crawler for HTML, screenshots, og:image."""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from html.parser import HTMLParser
|
||||
from io import StringIO
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import BROWSERLESS_URL
|
||||
from app.config import CRAWLER_URL
|
||||
from app.services.storage import storage
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _HTMLTextExtractor(HTMLParser):
|
||||
"""Simple HTML to text converter."""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._result = StringIO()
|
||||
self._skip = False
|
||||
self._skip_tags = {"script", "style", "noscript", "svg"}
|
||||
# ── YouTube helpers ──
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag in self._skip_tags:
|
||||
self._skip = True
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag in self._skip_tags:
|
||||
self._skip = False
|
||||
if tag in ("p", "div", "br", "h1", "h2", "h3", "h4", "li", "tr"):
|
||||
self._result.write("\n")
|
||||
|
||||
def handle_data(self, data):
|
||||
if not self._skip:
|
||||
self._result.write(data)
|
||||
|
||||
def get_text(self) -> str:
|
||||
raw = self._result.getvalue()
|
||||
# Collapse whitespace
|
||||
lines = [line.strip() for line in raw.splitlines()]
|
||||
return "\n".join(line for line in lines if line)
|
||||
def _extract_youtube_id(url: str) -> str | None:
|
||||
patterns = [
|
||||
r'(?:youtube\.com/watch\?.*v=|youtu\.be/|youtube\.com/shorts/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
|
||||
]
|
||||
for pat in patterns:
|
||||
m = re.search(pat, url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def html_to_text(html: str) -> str:
|
||||
extractor = _HTMLTextExtractor()
|
||||
extractor.feed(html)
|
||||
return extractor.get_text()
|
||||
def _is_youtube_url(url: str) -> bool:
    """True when a YouTube video id can be extracted from *url*."""
    return _extract_youtube_id(url) is not None
|
||||
|
||||
|
||||
def extract_title_from_html(html: str) -> str | None:
|
||||
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
|
||||
return match.group(1).strip() if match else None
|
||||
async def fetch_youtube_metadata(url: str) -> dict | None:
    """Fetch YouTube video metadata via oEmbed/noembed. No API key needed.

    Returns None for non-YouTube URLs; otherwise a dict with title, author,
    thumbnail_url, video_id and is_short (description remains None — neither
    endpoint supplies it). Network failures are logged and the partially
    filled dict is returned; the diff had left the retired fetch_url_content
    / browserless code interleaved here, which is removed.
    """
    video_id = _extract_youtube_id(url)
    if not video_id:
        return None

    result = {
        "title": None,
        "description": None,
        "author": None,
        "thumbnail_url": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
        "video_id": video_id,
        "is_short": "/shorts/" in url,
    }

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            oembed_url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
            resp = await client.get(oembed_url)
            if resp.status_code == 200:
                data = resp.json()
                result["title"] = data.get("title")
                result["author"] = data.get("author_name")

            # Only hit the noembed fallback for fields oEmbed did not supply;
            # skips the second request entirely when nothing is missing.
            if not result["title"] or not result["author"]:
                noembed_url = f"https://noembed.com/embed?url=https://www.youtube.com/watch?v={video_id}"
                resp2 = await client.get(noembed_url)
                if resp2.status_code == 200:
                    data2 = resp2.json()
                    if not result["title"]:
                        result["title"] = data2.get("title")
                    if not result["author"]:
                        result["author"] = data2.get("author_name")
    except Exception as e:
        # Best-effort: metadata failures must not block ingestion.
        log.warning(f"YouTube metadata fetch failed: {e}")

    return result
|
||||
|
||||
|
||||
async def fetch_with_browserless(url: str) -> dict | None:
|
||||
"""Use browserless/chrome to render JS-heavy pages."""
|
||||
async def download_youtube_thumbnail(url: str, item_id: str) -> str | None:
    """Download the best available YouTube thumbnail as the item's screenshot asset.

    Returns the storage path on success, None on any failure. The diff had
    left retired browserless request code interleaved inside this function,
    which is removed.
    """
    video_id = _extract_youtube_id(url)
    if not video_id:
        return None

    # maxresdefault is not published for every video; fall back to hqdefault.
    urls_to_try = [
        f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
        f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg",
    ]
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            for thumb_url in urls_to_try:
                resp = await client.get(thumb_url)
                # YouTube serves a tiny placeholder for missing sizes; skip it.
                if resp.status_code == 200 and len(resp.content) > 1000:
                    path = storage.save(
                        item_id=item_id, asset_type="screenshot",
                        filename="thumbnail.jpg", data=resp.content,
                    )
                    return path
    except Exception as e:
        log.warning(f"YouTube thumbnail download failed: {e}")
    return None
|
||||
|
||||
|
||||
async def take_screenshot(url: str, item_id: str) -> str | None:
|
||||
"""Take a screenshot of a URL using browserless. Returns storage path or None."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30) as client:
|
||||
resp = await client.post(
|
||||
f"{BROWSERLESS_URL}/screenshot",
|
||||
json={
|
||||
"url": url,
|
||||
"options": {"type": "png", "fullPage": False},
|
||||
"waitForTimeout": 3000,
|
||||
},
|
||||
async def download_youtube_video(url: str, item_id: str) -> tuple[str | None, dict]:
    """Download a YouTube video (<=720p mp4) via yt-dlp and store it.

    Returns (storage_path, info_dict) on success, (None, {}) on any failure
    (non-YouTube URL, yt-dlp error, timeout). info_dict is yt-dlp's
    .info.json payload when available.
    """
    import asyncio
    import subprocess
    import tempfile
    import os

    video_id = _extract_youtube_id(url)
    if not video_id:
        return None, {}

    with tempfile.TemporaryDirectory() as tmpdir:
        outpath = os.path.join(tmpdir, "%(id)s.%(ext)s")
        cmd = [
            "yt-dlp", "--no-playlist",
            "-f", "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best[height<=720]",
            "--merge-output-format", "mp4",
            "--write-info-json", "--no-write-playlist-metafiles",
            "-o", outpath, url,
        ]
        try:
            # Run in a worker thread so the blocking subprocess call does not
            # stall the event loop; hard 120s cap on the whole download.
            proc = await asyncio.to_thread(
                subprocess.run, cmd, capture_output=True, text=True, timeout=120,
            )
            if proc.returncode != 0:
                log.warning(f"yt-dlp failed: {proc.stderr[:300]}")
                return None, {}

            video_file = None
            info = {}
            for f in os.listdir(tmpdir):
                if f.endswith(".mp4"):
                    video_file = os.path.join(tmpdir, f)
                elif f.endswith(".info.json"):
                    import json as _json
                    with open(os.path.join(tmpdir, f)) as fh:
                        info = _json.load(fh)

            if not video_file:
                return None, {}

            # Read via a context manager so the handle is closed promptly
            # (the original leaked an open file object).
            with open(video_file, "rb") as vf:
                file_data = vf.read()
            path = storage.save(
                item_id=item_id, asset_type="video",
                filename=f"{video_id}.mp4", data=file_data,
            )
            log.info(f"Downloaded YouTube video: {len(file_data)} bytes -> {path}")
            return path, info
        except subprocess.TimeoutExpired:
            log.warning(f"yt-dlp timed out for {url}")
            return None, {}
        except Exception as e:
            log.error(f"YouTube download failed: {e}")
            return None, {}
|
||||
|
||||
|
||||
# ── Main crawler (Playwright stealth service) ──
|
||||
|
||||
async def crawl_url(url: str) -> dict:
    """Call the Playwright crawler service for *url*.

    Returns the crawler's JSON dict (html, text, title, description, author,
    og_image_url, screenshot base64, status_code, error).  On any failure a
    stub dict carrying a meaningful ``error`` message is returned instead of
    raising.
    """
    error_msg = "unknown"
    try:
        async with httpx.AsyncClient(timeout=45) as client:
            resp = await client.post(f"{CRAWLER_URL}/crawl", json={"url": url})
            if resp.status_code == 200:
                return resp.json()
            error_msg = f"crawler returned HTTP {resp.status_code}"
            log.warning(f"Crawler returned {resp.status_code} for {url}")
    except Exception as e:
        # Capture the message here: the exception variable is unbound once the
        # except block exits, so the original `'e' in dir()` check never fired
        # and the error field was always "unknown".
        error_msg = str(e)
        log.error(f"Crawler request failed for {url}: {e}")
    return {"url": url, "html": None, "text": None, "title": None,
            "description": None, "og_image_url": None, "screenshot": None,
            "error": error_msg}
|
||||
|
||||
|
||||
async def save_screenshot_from_base64(b64: str, item_id: str) -> str | None:
    """Persist a base64-encoded screenshot as a storage asset.

    Returns the storage path, or None when the payload is too small to be a
    real image or anything goes wrong while decoding/saving.
    """
    try:
        raw = base64.b64decode(b64)
    except Exception as e:
        log.error(f"Screenshot save failed: {e}")
        return None
    # Tiny payloads are almost certainly not real screenshots.
    if len(raw) < 500:
        return None
    try:
        return storage.save(
            item_id=item_id,
            asset_type="screenshot",
            filename="screenshot.jpg",
            data=raw,
        )
    except Exception as e:
        log.error(f"Screenshot save failed: {e}")
        return None
|
||||
|
||||
|
||||
async def download_og_image(og_url: str, item_id: str) -> str | None:
    """Download an og:image URL and save it as an "og_image" asset.

    Returns the storage path, or None if the request fails, the response is
    not a 200, or the payload is too small to be a real image.
    """
    # og:image URLs scraped from HTML frequently contain entity-escaped
    # ampersands; unescape them before fetching.
    og_url = og_url.replace("&amp;", "&")
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(og_url, headers={
                "User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"
            })
            # Anything under ~1 KB is likely an error page or tracking pixel.
            if resp.status_code == 200 and len(resp.content) > 1000:
                ct = resp.headers.get("content-type", "image/jpeg")
                ext = "png" if "png" in ct else "jpg"
                # Single storage call — the original had two merge-conflicting
                # sets of keyword arguments (item_id/filename passed twice).
                path = storage.save(
                    item_id=item_id, asset_type="og_image",
                    filename=f"og_image.{ext}", data=resp.content,
                )
                log.info(f"Downloaded og:image ({len(resp.content)} bytes) for {item_id}")
                return path
    except Exception as e:
        log.warning(f"og:image download failed: {e}")
    return None
|
||||
|
||||
|
||||
async def archive_html(html: str, item_id: str) -> str | None:
|
||||
"""Save the full HTML as an archived asset."""
|
||||
"""Save full HTML as an archived asset."""
|
||||
if not html:
|
||||
return None
|
||||
try:
|
||||
path = storage.save(
|
||||
item_id=item_id,
|
||||
asset_type="archived_html",
|
||||
filename="page.html",
|
||||
data=html.encode("utf-8"),
|
||||
item_id=item_id, asset_type="archived_html",
|
||||
filename="page.html", data=html.encode("utf-8"),
|
||||
)
|
||||
return path
|
||||
except Exception as e:
|
||||
|
||||
@@ -12,6 +12,7 @@ from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import REDIS_URL, DATABASE_URL_SYNC
|
||||
from app.models.item import Item, ItemAsset
|
||||
from app.models.taxonomy import Folder, Tag, ItemTag # noqa: F401 — register FK targets
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@@ -34,7 +35,7 @@ async def _process_item(item_id: str):
|
||||
"""Full processing pipeline for a saved item."""
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
|
||||
from app.config import DATABASE_URL
|
||||
from app.services.ingest import fetch_url_content, take_screenshot, archive_html
|
||||
from app.services.ingest import crawl_url, save_screenshot_from_base64, download_og_image, archive_html
|
||||
from app.services.classify import classify_item
|
||||
from app.services.embed import generate_embedding
|
||||
from app.search.engine import index_item, ensure_meili_index
|
||||
@@ -62,42 +63,96 @@ async def _process_item(item_id: str):
|
||||
|
||||
# ── Step 1: Fetch content for URLs ──
|
||||
if item.type == "link" and item.url:
|
||||
log.info(f"Fetching URL: {item.url}")
|
||||
content = await fetch_url_content(item.url)
|
||||
html_content = content.get("html")
|
||||
extracted_text = content.get("text") or extracted_text
|
||||
if not title:
|
||||
title = content.get("title")
|
||||
from app.services.ingest import (
|
||||
_is_youtube_url, download_youtube_thumbnail,
|
||||
download_youtube_video, fetch_youtube_metadata,
|
||||
)
|
||||
|
||||
item.metadata_json = item.metadata_json or {}
|
||||
item.metadata_json["description"] = content.get("description")
|
||||
item.metadata_json["used_browserless"] = content.get("used_browserless", False)
|
||||
is_yt = _is_youtube_url(item.url)
|
||||
|
||||
# Take screenshot
|
||||
screenshot_path = await take_screenshot(item.url, item.id)
|
||||
if screenshot_path:
|
||||
asset = ItemAsset(
|
||||
id=str(uuid.uuid4()),
|
||||
item_id=item.id,
|
||||
asset_type="screenshot",
|
||||
filename="screenshot.png",
|
||||
content_type="image/png",
|
||||
storage_path=screenshot_path,
|
||||
)
|
||||
db.add(asset)
|
||||
if is_yt:
|
||||
# YouTube: use oEmbed + thumbnail + yt-dlp (no crawler needed)
|
||||
log.info(f"Processing YouTube URL: {item.url}")
|
||||
yt_meta = await fetch_youtube_metadata(item.url)
|
||||
if yt_meta:
|
||||
if not title:
|
||||
title = yt_meta.get("title")
|
||||
extracted_text = f"YouTube: {yt_meta.get('title','')}\nBy: {yt_meta.get('author','')}"
|
||||
item.metadata_json["youtube"] = {
|
||||
"video_id": yt_meta.get("video_id"),
|
||||
"author": yt_meta.get("author"),
|
||||
"is_short": yt_meta.get("is_short", False),
|
||||
}
|
||||
item.metadata_json["description"] = f"YouTube video by {yt_meta.get('author','')}"
|
||||
|
||||
# Archive HTML
|
||||
if html_content:
|
||||
html_path = await archive_html(html_content, item.id)
|
||||
if html_path:
|
||||
asset = ItemAsset(
|
||||
id=str(uuid.uuid4()),
|
||||
item_id=item.id,
|
||||
asset_type="archived_html",
|
||||
filename="page.html",
|
||||
content_type="text/html",
|
||||
storage_path=html_path,
|
||||
)
|
||||
db.add(asset)
|
||||
# Download video
|
||||
log.info(f"Downloading YouTube video: {item.url}")
|
||||
video_path, yt_info = await download_youtube_video(item.url, item.id)
|
||||
if video_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="video", filename=f"{yt_meta['video_id']}.mp4",
|
||||
content_type="video/mp4", storage_path=video_path,
|
||||
))
|
||||
if yt_info.get("duration"):
|
||||
item.metadata_json["youtube"]["duration"] = yt_info["duration"]
|
||||
if yt_info.get("description"):
|
||||
item.metadata_json["youtube"]["description"] = yt_info["description"][:500]
|
||||
extracted_text = f"YouTube: {title or ''}\nBy: {(yt_meta or {}).get('author','')}\n{yt_info['description'][:2000]}"
|
||||
|
||||
# Thumbnail
|
||||
thumb_path = await download_youtube_thumbnail(item.url, item.id)
|
||||
if thumb_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="screenshot", filename="thumbnail.jpg",
|
||||
content_type="image/jpeg", storage_path=thumb_path,
|
||||
))
|
||||
|
||||
else:
|
||||
# Regular URL: use Playwright crawler (stealth)
|
||||
log.info(f"Crawling URL: {item.url}")
|
||||
crawl = await crawl_url(item.url)
|
||||
html_content = crawl.get("html")
|
||||
extracted_text = crawl.get("text") or extracted_text
|
||||
if not title:
|
||||
title = crawl.get("title")
|
||||
item.metadata_json["description"] = crawl.get("description")
|
||||
item.metadata_json["author"] = crawl.get("author")
|
||||
item.metadata_json["status_code"] = crawl.get("status_code")
|
||||
|
||||
# Screenshot (from crawler, base64 JPEG)
|
||||
if crawl.get("screenshot"):
|
||||
ss_path = await save_screenshot_from_base64(crawl["screenshot"], item.id)
|
||||
if ss_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="screenshot", filename="screenshot.jpg",
|
||||
content_type="image/jpeg", storage_path=ss_path,
|
||||
))
|
||||
|
||||
# og:image (extracted from rendered DOM by crawler)
|
||||
og_url = crawl.get("og_image_url")
|
||||
if og_url:
|
||||
og_path = await download_og_image(og_url, item.id)
|
||||
if og_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="og_image", filename="og_image.jpg",
|
||||
content_type="image/jpeg", storage_path=og_path,
|
||||
))
|
||||
item.metadata_json["og_image_url"] = og_url
|
||||
|
||||
# Archive HTML
|
||||
if html_content:
|
||||
html_path = await archive_html(html_content, item.id)
|
||||
if html_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="archived_html", filename="page.html",
|
||||
content_type="text/html", storage_path=html_path,
|
||||
))
|
||||
|
||||
# ── Step 1b: Process uploaded files (PDF, image, document) ──
|
||||
if item.type in ("pdf", "image", "document", "file"):
|
||||
|
||||
20
services/brain/crawler/Dockerfile
Normal file
20
services/brain/crawler/Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
FROM node:20-slim

# Install Playwright's Chromium system dependencies, pinned to the same
# version as package.json's playwright-core (1.50.0).
RUN npx playwright@1.50.0 install-deps chromium

WORKDIR /app

COPY package.json ./
RUN npm install
# Install the Chromium build matching playwright-core 1.50.0 — an unpinned
# `npx playwright install` could fetch a newer, incompatible browser build.
RUN npx playwright@1.50.0 install chromium

COPY server.js ./

ENV NODE_ENV=production
EXPOSE 3100

# NOTE(review): confirm node:20-slim ships wget — if not, the healthcheck will
# always fail; switch to `node -e "fetch(...)"` or install wget.
HEALTHCHECK --interval=15s --timeout=5s --retries=3 CMD wget -qO- http://localhost:3100/health || exit 1

CMD ["node", "server.js"]
|
||||
24
services/brain/crawler/package.json
Normal file
24
services/brain/crawler/package.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"name": "brain-crawler",
|
||||
"version": "1.0.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"start": "node server.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"playwright-extra": "^4.3.6",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||
"playwright-core": "^1.50.0",
|
||||
"metascraper": "^5.45.25",
|
||||
"metascraper-image": "^5.45.25",
|
||||
"metascraper-title": "^5.45.25",
|
||||
"metascraper-description": "^5.45.25",
|
||||
"metascraper-author": "^5.45.25",
|
||||
"metascraper-date": "^5.45.25",
|
||||
"metascraper-publisher": "^5.45.25",
|
||||
"metascraper-url": "^5.45.25",
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"jsdom": "^25.0.0"
|
||||
}
|
||||
}
|
||||
370
services/brain/crawler/server.js
Normal file
370
services/brain/crawler/server.js
Normal file
@@ -0,0 +1,370 @@
|
||||
import http from "node:http";
|
||||
import { chromium } from "playwright-extra";
|
||||
import StealthPlugin from "puppeteer-extra-plugin-stealth";
|
||||
import { Readability } from "@mozilla/readability";
|
||||
import { JSDOM } from "jsdom";
|
||||
|
||||
chromium.use(StealthPlugin());
|
||||
|
||||
const PORT = parseInt(process.env.PORT || "3100");
|
||||
const VIEWPORT = { width: 1440, height: 900 };
|
||||
const USER_AGENT =
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
|
||||
const NAV_TIMEOUT = 30_000;
|
||||
const SCREENSHOT_TIMEOUT = 8_000;
|
||||
|
||||
let browser = null;
|
||||
|
||||
// Lazily (re)launch the shared Chromium instance.  A disconnected browser
// (crash, OOM-kill) is closed and replaced so callers always get a live one.
async function ensureBrowser() {
  if (browser?.isConnected()) return browser;
  if (browser) {
    try { await browser.close(); } catch {}
    browser = null;
  }
  console.log("[crawler] Launching browser...");
  const launchArgs = [
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
  ];
  browser = await chromium.launch({ headless: true, args: launchArgs });
  console.log("[crawler] Browser ready");
  return browser;
}
|
||||
|
||||
// Extract og:image and other meta from rendered HTML
|
||||
// Pull og:image / title / description / author / favicon out of raw HTML.
// Uses tolerant regexes (attribute order varies between sites); the first
// matching candidate pattern per field wins.
function extractMeta(html) {
  const patterns = {
    og_image: [
      /(?:property|name)=["']og:image["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:image["']/i,
    ],
    title: [
      /(?:property|name)=["']og:title["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:title["']/i,
      /<title[^>]*>([^<]+)<\/title>/i,
    ],
    description: [
      /(?:property|name)=["']og:description["'][^>]*content=["']([^"']+)["']/i,
      /name=["']description["'][^>]*content=["']([^"']+)["']/i,
      /content=["']([^"']+)["'][^>]*(?:property|name)=["']og:description["']/i,
    ],
    author: [
      /name=["']author["'][^>]*content=["']([^"']+)["']/i,
      /property=["']article:author["'][^>]*content=["']([^"']+)["']/i,
    ],
    favicon: [
      /rel=["']icon["'][^>]*href=["']([^"']+)["']/i,
      /rel=["']shortcut icon["'][^>]*href=["']([^"']+)["']/i,
    ],
  };

  const meta = {};
  for (const [field, candidates] of Object.entries(patterns)) {
    for (const re of candidates) {
      const hit = html.match(re);
      if (!hit) continue;
      meta[field] = hit[1].trim();
      break;
    }
  }
  return meta;
}
|
||||
|
||||
// True only for canonical reddit.com hosts (www or bare).  Other subdomains
// such as old.reddit.com are deliberately not matched here.
function isRedditUrl(url) {
  let host;
  try {
    host = new URL(url).hostname;
  } catch {
    return false;
  }
  return host === "www.reddit.com" || host === "reddit.com";
}
|
||||
|
||||
// Reddit "share" URLs (/r/sub/s/<token>) redirect to the canonical post URL.
// Resolve them with a redirect-following HEAD request so the JSON API gets a
// /comments/ path it understands; any failure falls back to the input URL.
async function resolveRedditShortUrl(url) {
  if (!/\/s\/[a-zA-Z0-9]+/.test(url)) return url;
  try {
    const head = await fetch(url, {
      method: "HEAD",
      redirect: "follow",
      headers: { "User-Agent": "SecondBrain/1.0" },
    });
    const finalUrl = head.url;
    if (finalUrl && finalUrl.includes("/comments/")) {
      console.log(`[crawler] Reddit short URL resolved: ${url} -> ${finalUrl}`);
      return finalUrl;
    }
  } catch (e) {
    console.warn("[crawler] Reddit short URL resolve failed:", e.message);
  }
  return url;
}
|
||||
|
||||
// Fetch a Reddit post via the public JSON API (append ".json" to the post
// URL) — avoids login walls entirely.  Returns a crawl-result-shaped object,
// or null so the caller can fall back to the browser.
async function fetchRedditJson(url) {
  // Resolve share-style short URLs first so ".json" lands on a post path.
  url = await resolveRedditShortUrl(url);

  try {
    const jsonUrl = url.replace(/\/?(\?.*)?$/, "/.json$1");
    const resp = await fetch(jsonUrl, {
      headers: { "User-Agent": "SecondBrain/1.0" },
      redirect: "follow",
    });
    if (!resp.ok) return null;
    const data = await resp.json();
    const post = data?.[0]?.data?.children?.[0]?.data;
    if (!post) return null;

    // Reddit HTML-escapes URLs in its JSON payloads; the original's
    // replace(/&/g, "&") was a no-op — unescape &amp; properly.
    const previewImg = (post.preview?.images?.[0]?.source?.url || "").replace(/&amp;/g, "&") || null;
    const thumbnail = post.thumbnail?.startsWith("http") ? post.thumbnail : null;

    // If no preview image, fall back to the subreddit's own icon.
    let ogImage = previewImg || thumbnail || null;
    if (!ogImage && post.subreddit) {
      try {
        const aboutResp = await fetch(
          `https://www.reddit.com/r/${post.subreddit}/about.json`,
          { headers: { "User-Agent": "SecondBrain/1.0" } }
        );
        if (aboutResp.ok) {
          const about = await aboutResp.json();
          const icon = about?.data?.community_icon?.replace(/&amp;/g, "&")?.split("?")?.[0]
            || about?.data?.icon_img
            || about?.data?.header_img;
          if (icon && icon.startsWith("http")) {
            ogImage = icon;
          }
        }
      } catch {}
    }

    return {
      url,
      html: null,
      text: `${post.title || ""}\n\n${post.selftext || ""}`.trim(),
      title: post.title || null,
      description: (post.selftext || "").slice(0, 200) || null,
      author: post.author ? `u/${post.author}` : null,
      og_image_url: ogImage ? ogImage.replace(/&amp;/g, "&") : null,
      favicon: null,
      screenshot: null,
      status_code: 200,
      error: null,
      subreddit: post.subreddit_name_prefixed || null,
    };
  } catch (e) {
    console.warn("[crawler] Reddit JSON failed:", e.message);
    return null;
  }
}
|
||||
|
||||
// Crawl one URL: Reddit goes through the JSON API; everything else gets a
// stealth Chromium render with screenshot, Readability extraction, and meta
// scraping.  Always resolves — failures are reported via result.error.
async function crawl(url) {
  // Reddit: use JSON API (avoids login walls entirely)
  if (isRedditUrl(url)) {
    const redditData = await fetchRedditJson(url);
    if (redditData) {
      console.log(`[crawler] Reddit JSON OK: ${url} (og=${!!redditData.og_image_url})`);
      return redditData;
    }
    console.log(`[crawler] Reddit JSON failed, falling back to browser: ${url}`);
  }

  const crawlUrl = url;
  let b;
  try {
    b = await ensureBrowser();
  } catch (e) {
    // One retry with a fresh instance if the shared browser failed to launch.
    console.error("[crawler] Browser launch failed, retrying:", e.message);
    browser = null;
    b = await ensureBrowser();
  }
  const contextOpts = {
    viewport: VIEWPORT,
    userAgent: USER_AGENT,
    ignoreHTTPSErrors: true,
  };

  // Reddit fallback: send browser-like Accept headers to look less like a bot
  if (isRedditUrl(url)) {
    contextOpts.extraHTTPHeaders = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
    };
  }

  const context = await b.newContext(contextOpts);

  const page = await context.newPage();
  // Result contract shared with the Python side (crawl_url in ingest).
  const result = {
    url,
    html: null,
    text: null,
    readable_html: null,
    title: null,
    description: null,
    author: null,
    og_image_url: null,
    favicon: null,
    screenshot: null, // base64
    status_code: null,
    error: null,
  };

  try {
    // Navigate (use normalized URL to avoid login walls)
    const response = await page.goto(crawlUrl, {
      waitUntil: "domcontentloaded",
      timeout: NAV_TIMEOUT,
    });
    result.status_code = response?.status() || null;

    // Wait for network to settle (up to 5s)
    try {
      await page.waitForLoadState("networkidle", { timeout: 5000 });
    } catch {
      // networkidle timeout is fine, page is probably loaded enough
    }

    // Reddit: dismiss login modals and overlays
    if (isRedditUrl(url)) {
      await page.evaluate(() => {
        // Remove login modal/overlay
        document.querySelectorAll('shreddit-overlay-display, [id*="login"], .overlay-container, reddit-cookie-banner').forEach(el => el.remove());
        // Remove any body scroll locks
        document.body.style.overflow = 'auto';
        document.documentElement.style.overflow = 'auto';
      }).catch(() => {});
      await page.waitForTimeout(1000);
    }

    // Get rendered HTML + screenshot in parallel
    const [html, screenshot] = await Promise.all([
      page.content(),
      page
        .screenshot({ type: "jpeg", quality: 80, fullPage: false })
        .catch((e) => {
          console.warn("[crawler] Screenshot failed:", e.message);
          return null;
        }),
    ]);

    result.html = html;

    // Extract text from page (best content container first, body as fallback)
    result.text = await page
      .evaluate(() => {
        const el =
          document.querySelector("article") ||
          document.querySelector("main") ||
          document.querySelector('[role="main"]') ||
          document.body;
        return el ? el.innerText.slice(0, 10000) : "";
      })
      .catch(() => "");

    // Extract readable article HTML via Mozilla Readability
    try {
      const dom = new JSDOM(html, { url: crawlUrl });
      const reader = new Readability(dom.window.document);
      const article = reader.parse();
      if (article && article.content) {
        result.readable_html = article.content;
        if (article.textContent) {
          // Prefer Readability's cleaned text over the raw innerText grab.
          result.text = article.textContent.slice(0, 10000);
        }
      }
    } catch (e) {
      console.warn("[crawler] Readability failed:", e.message);
    }

    // Extract meta from rendered DOM
    const meta = extractMeta(html);
    result.title = meta.title || (await page.title()) || null;
    result.description = meta.description || null;
    result.author = meta.author || null;
    result.og_image_url = meta.og_image || null;
    result.favicon = meta.favicon || null;

    // Screenshot as base64
    if (screenshot) {
      result.screenshot = screenshot.toString("base64");
    }
  } catch (e) {
    result.error = e.message;
    console.error("[crawler] Crawl error:", url, e.message);
    // If browser crashed, reset it for next request
    if (e.message.includes("closed") || e.message.includes("crashed")) {
      browser = null;
    }
  } finally {
    await page.close().catch(() => {});
    await context.close().catch(() => {});
  }

  return result;
}
|
||||
|
||||
// Simple HTTP server
|
||||
// Minimal HTTP façade: GET /health for container checks, POST /crawl {url}
// for the Python ingest service.  No framework — plain node:http.
const server = http.createServer(async (req, res) => {
  // Health check
  if (req.method === "GET" && req.url === "/health") {
    res.writeHead(200, { "Content-Type": "application/json" });
    res.end(JSON.stringify({ status: "ok" }));
    return;
  }

  // Crawl endpoint
  if (req.method === "POST" && req.url === "/crawl") {
    let body = "";
    req.on("data", (chunk) => (body += chunk));
    req.on("end", async () => {
      try {
        const { url } = JSON.parse(body);
        if (!url) {
          res.writeHead(400, { "Content-Type": "application/json" });
          res.end(JSON.stringify({ error: "url is required" }));
          return;
        }

        console.log(`[crawler] Crawling: ${url}`);
        const result = await crawl(url);
        console.log(
          `[crawler] Done: ${url} (status=${result.status_code}, og=${!!result.og_image_url}, ss=${!!result.screenshot})`
        );

        res.writeHead(200, { "Content-Type": "application/json" });
        res.end(JSON.stringify(result));
      } catch (e) {
        console.error("[crawler] Request error:", e);
        res.writeHead(500, { "Content-Type": "application/json" });
        res.end(JSON.stringify({ error: e.message }));
      }
    });
    return;
  }

  res.writeHead(404);
  res.end("Not found");
});

// Startup: warm the browser before accepting traffic.
(async () => {
  await ensureBrowser();
  server.listen(PORT, () => {
    console.log(`[crawler] Listening on :${PORT}`);
  });
})();

// Graceful shutdown
process.on("SIGTERM", async () => {
  console.log("[crawler] Shutting down...");
  if (browser) await browser.close().catch(() => {});
  process.exit(0);
});
|
||||
@@ -13,7 +13,7 @@ services:
|
||||
- REDIS_URL=redis://brain-redis:6379/0
|
||||
- MEILI_URL=http://brain-meili:7700
|
||||
- MEILI_MASTER_KEY=${MEILI_MASTER_KEY:-brain-meili-secure-key-2026}
|
||||
- BROWSERLESS_URL=http://brain-browserless:3000
|
||||
- CRAWLER_URL=http://brain-crawler:3100
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o-mini}
|
||||
- PORT=8200
|
||||
@@ -44,7 +44,7 @@ services:
|
||||
- REDIS_URL=redis://brain-redis:6379/0
|
||||
- MEILI_URL=http://brain-meili:7700
|
||||
- MEILI_MASTER_KEY=${MEILI_MASTER_KEY:-brain-meili-secure-key-2026}
|
||||
- BROWSERLESS_URL=http://brain-browserless:3000
|
||||
- CRAWLER_URL=http://brain-crawler:3100
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o-mini}
|
||||
- TZ=${TZ:-America/Chicago}
|
||||
@@ -90,14 +90,17 @@ services:
|
||||
volumes:
|
||||
- ./data/meili:/meili_data
|
||||
|
||||
# ── Browserless (headless Chrome for JS rendering + screenshots) ──
|
||||
brain-browserless:
|
||||
image: ghcr.io/browserless/chromium:latest
|
||||
container_name: brain-browserless
|
||||
# ── Crawler (Playwright + stealth for JS rendering + screenshots) ──
|
||||
brain-crawler:
|
||||
build:
|
||||
context: ./crawler
|
||||
dockerfile: Dockerfile
|
||||
container_name: brain-crawler
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- MAX_CONCURRENT_SESSIONS=3
|
||||
- TIMEOUT=30000
|
||||
- PORT=3100
|
||||
- TZ=${TZ:-America/Chicago}
|
||||
shm_size: '1gb'
|
||||
|
||||
networks:
|
||||
pangolin:
|
||||
|
||||
254
services/brain/migrate_karakeep.py
Normal file
254
services/brain/migrate_karakeep.py
Normal file
@@ -0,0 +1,254 @@
|
||||
"""Migrate all bookmarks from Karakeep into Brain service via API."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import tempfile
|
||||
|
||||
# Karakeep (source) and Brain (destination) endpoints for the migration.
KARAKEEP_URL = os.environ.get("KARAKEEP_URL", "http://192.168.1.42:3005")
# NOTE(review): hard-coded API key committed as a fallback — rotate this key
# and drop the default so the script fails fast when KARAKEEP_API_KEY is unset.
KARAKEEP_API_KEY = os.environ.get("KARAKEEP_API_KEY", "ak2_f4141e5fe7265e23bd6f_4549c932c262010eafd08acb2139f1ac")
BRAIN_URL = "http://localhost:8200"
BRAIN_USER = "admin"  # sent as X-Gateway-User-Id (trusted-network auth)
|
||||
|
||||
|
||||
def karakeep_get(path):
    """GET a Karakeep API *path* and return the parsed JSON body."""
    req = urllib.request.Request(
        f"{KARAKEEP_URL}{path}",
        headers={"Authorization": f"Bearer {KARAKEEP_API_KEY}"},
    )
    # Close the response promptly — the original leaked the socket.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def karakeep_download(asset_id):
    """Download a Karakeep asset; returns (raw_bytes, content_type)."""
    req = urllib.request.Request(
        f"{KARAKEEP_URL}/api/v1/assets/{asset_id}",
        headers={"Authorization": f"Bearer {KARAKEEP_API_KEY}"},
    )
    # Close the response promptly — the original leaked the socket.
    with urllib.request.urlopen(req, timeout=120) as resp:
        return resp.read(), resp.headers.get("Content-Type", "application/octet-stream")
|
||||
|
||||
|
||||
def brain_post_json(path, data):
    """POST *data* as JSON to the Brain API and return the parsed response."""
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BRAIN_URL}/api{path}",
        data=body,
        headers={"X-Gateway-User-Id": BRAIN_USER, "Content-Type": "application/json"},
        method="POST",
    )
    # Close the response promptly — the original leaked the socket.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def brain_upload(file_data, filename, content_type, title=None):
    """Multipart upload to /api/items/upload; returns the created item JSON.

    Args:
        file_data: Raw file bytes.
        filename: Filename to report in the multipart part.
        content_type: MIME type of the file part.
        title: Optional item title, sent as a second form field.
    """
    boundary = "----MigrationBoundary12345"
    parts = []

    # File part.  BUG FIX: the original hard-coded filename="(unknown)" in a
    # placeholder-less f-string, silently dropping the caller-supplied name.
    parts.append(f"--{boundary}\r\n".encode())
    parts.append(
        f'Content-Disposition: form-data; name="file"; filename="{filename}"\r\n'.encode()
    )
    parts.append(f"Content-Type: {content_type}\r\n\r\n".encode())
    parts.append(file_data)
    parts.append(b"\r\n")

    # Title part
    if title:
        parts.append(f"--{boundary}\r\n".encode())
        parts.append(b'Content-Disposition: form-data; name="title"\r\n\r\n')
        parts.append(title.encode())
        parts.append(b"\r\n")

    parts.append(f"--{boundary}--\r\n".encode())
    body = b"".join(parts)

    req = urllib.request.Request(
        f"{BRAIN_URL}/api/items/upload",
        data=body,
        headers={
            "X-Gateway-User-Id": BRAIN_USER,
            "Content-Type": f"multipart/form-data; boundary={boundary}",
        },
        method="POST",
    )
    # Close the response promptly — the original leaked the socket.
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def brain_get_item(item_id):
    """GET a single Brain item by id; returns the parsed JSON dict."""
    req = urllib.request.Request(
        f"{BRAIN_URL}/api/items/{item_id}",
        headers={"X-Gateway-User-Id": BRAIN_USER},
    )
    # Close the response promptly — the original leaked the socket.
    with urllib.request.urlopen(req, timeout=15) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def fetch_all_bookmarks():
    """Page through the Karakeep bookmarks endpoint until exhausted."""
    bookmarks = []
    cursor = None
    while True:
        endpoint = "/api/v1/bookmarks?limit=100"
        if cursor:
            endpoint = f"{endpoint}&cursor={cursor}"
        page = karakeep_get(endpoint)
        batch = page.get("bookmarks", [])
        bookmarks.extend(batch)
        cursor = page.get("nextCursor")
        # Stop when the API reports no further cursor or returns nothing.
        if not (cursor and batch):
            break
    return bookmarks
|
||||
|
||||
|
||||
def wait_for_processing(item_id, timeout=120):
    """Poll the item until processing finishes or *timeout* seconds elapse.

    Returns the item dict; on timeout, returns one final fetch regardless of
    its processing_status.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        item = brain_get_item(item_id)
        if item.get("processing_status", "pending") in ("ready", "error"):
            return item
        time.sleep(3)
    return brain_get_item(item_id)
|
||||
|
||||
|
||||
def main():
    """Migrate every Karakeep bookmark into Brain, then print a comparison
    of Karakeep's tags/folders against Brain's AI classification."""
    print("Fetching all Karakeep bookmarks...")
    bookmarks = fetch_all_bookmarks()
    print(f"Found {len(bookmarks)} bookmarks\n")

    # Sort: notes first, then links, then assets (PDFs take longer)
    def sort_key(b):
        t = b.get("content", {}).get("type", "")
        return {"text": 0, "link": 1, "asset": 2}.get(t, 3)
    bookmarks.sort(key=sort_key)

    results = {"success": 0, "error": 0, "skipped": 0}
    comparison = []  # per-item Karakeep-vs-AI records, dumped to JSON at end

    for i, bk in enumerate(bookmarks):
        content = bk.get("content", {})
        bk_type = content.get("type", "unknown")
        bk_title = bk.get("title") or "Untitled"
        bk_tags = [t["name"] for t in bk.get("tags", [])]
        bk_list = bk.get("list", {})
        bk_folder = bk_list.get("name") if bk_list else None

        print(f"[{i+1}/{len(bookmarks)}] {bk_type}: {bk_title[:60]}")

        try:
            # Dispatch by Karakeep bookmark type: link -> Brain link item,
            # text -> Brain note, asset -> file upload.
            if bk_type == "link":
                url = content.get("url", "")
                if not url:
                    print(" SKIP: no URL")
                    results["skipped"] += 1
                    continue
                resp = brain_post_json("/items", {
                    "type": "link",
                    "url": url,
                    "title": bk_title if bk_title != "Untitled" else None,
                })

            elif bk_type == "text":
                text = content.get("text", "")
                if not text:
                    print(" SKIP: no text")
                    results["skipped"] += 1
                    continue
                resp = brain_post_json("/items", {
                    "type": "note",
                    "raw_content": text,
                    "title": bk_title if bk_title != "Untitled" else None,
                })

            elif bk_type == "asset":
                asset_id = content.get("assetId")
                asset_type = content.get("assetType", "unknown")
                if not asset_id:
                    print(" SKIP: no assetId")
                    results["skipped"] += 1
                    continue

                print(f" Downloading {asset_type} ({asset_id[:8]})...")
                file_data, ct = karakeep_download(asset_id)
                ext = {"pdf": ".pdf", "image": ".png"}.get(asset_type, ".bin")
                filename = f"{bk_title[:50]}{ext}" if bk_title != "Untitled" else f"upload{ext}"
                # Clean filename
                filename = filename.replace("/", "-").replace("\\", "-")
                if asset_type == "pdf":
                    ct = "application/pdf"
                resp = brain_upload(file_data, filename, ct, title=bk_title if bk_title != "Untitled" else None)
            else:
                print(f" SKIP: unknown type '{bk_type}'")
                results["skipped"] += 1
                continue

            item_id = resp.get("id")
            print(f" Created: {item_id} — waiting for AI classification...")

            # Wait for processing
            final = wait_for_processing(item_id, timeout=90)
            status = final.get("processing_status", "?")
            ai_folder = final.get("folder", "?")
            ai_tags = final.get("tags", [])
            ai_title = final.get("title", "?")

            # Compare
            entry = {
                "karakeep_title": bk_title,
                "karakeep_tags": bk_tags,
                "karakeep_folder": bk_folder,
                "ai_title": ai_title,
                "ai_folder": ai_folder,
                "ai_tags": ai_tags,
                "status": status,
            }
            comparison.append(entry)

            # "OK" when at least one tag overlaps, or both sides are empty.
            tag_match = "OK" if set(bk_tags) & set(ai_tags) or (not bk_tags and not ai_tags) else "DIFF"

            print(f" Status: {status}")
            print(f" AI Folder: {ai_folder} (Karakeep: {bk_folder or 'none'})")
            print(f" AI Tags: {ai_tags} vs Karakeep: {bk_tags} [{tag_match}]")
            print(f" AI Title: {ai_title}")

            results["success"] += 1

        except Exception as e:
            # Per-item failures are logged and counted; migration continues.
            print(f" ERROR: {e}")
            results["error"] += 1

        print()

    # Summary
    print("=" * 60)
    print(f"MIGRATION COMPLETE")
    print(f" Success: {results['success']}")
    print(f" Errors: {results['error']}")
    print(f" Skipped: {results['skipped']}")
    print()

    # Tag comparison summary
    matches = 0
    diffs = 0
    for c in comparison:
        kk = set(c["karakeep_tags"])
        ai = set(c["ai_tags"])
        if kk & ai or (not kk and not ai):
            matches += 1
        else:
            diffs += 1
    print(f"Tag overlap: {matches}/{len(comparison)} items had at least one matching tag")
    print(f"Tag differences: {diffs}/{len(comparison)} items had zero overlap")

    # Save comparison
    with open("/tmp/migration_comparison.json", "w") as f:
        json.dump(comparison, f, indent=2)
    print("\nFull comparison saved to /tmp/migration_comparison.json")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -2035,6 +2035,24 @@ class CalorieHandler(BaseHTTPRequestHandler):
|
||||
if user:
|
||||
return user
|
||||
|
||||
# Internal gateway auth (trusted Docker network only)
|
||||
gateway_user_id = self.headers.get('X-Gateway-User-Id', '')
|
||||
gateway_user_name = self.headers.get('X-Gateway-User-Name', '')
|
||||
if gateway_user_id:
|
||||
conn = get_db()
|
||||
row = None
|
||||
# Try username match
|
||||
if gateway_user_name:
|
||||
row = conn.execute("SELECT * FROM users WHERE username = ? COLLATE NOCASE", (gateway_user_name.lower(),)).fetchone()
|
||||
# Try display name match
|
||||
if not row:
|
||||
row = conn.execute("SELECT * FROM users WHERE display_name = ? COLLATE NOCASE", (gateway_user_name,)).fetchone()
|
||||
if not row:
|
||||
row = conn.execute("SELECT * FROM users WHERE id = ?", (gateway_user_id,)).fetchone()
|
||||
conn.close()
|
||||
if row:
|
||||
return dict(row)
|
||||
|
||||
return None
|
||||
|
||||
def _send_json(self, data, status=200):
|
||||
|
||||
22
services/reader/Dockerfile.api
Normal file
22
services/reader/Dockerfile.api
Normal file
@@ -0,0 +1,22 @@
|
||||
# Reader API image — FastAPI app served by uvicorn on port 8300.
FROM python:3.12-slim

WORKDIR /app

# libpq-dev: PostgreSQL client headers/libs (presumably needed to build the DB driver — confirm against requirements.txt)
RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev && rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip

# Install dependencies before copying code so Docker layer caching survives code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Run as an unprivileged user.
RUN adduser --disabled-password --no-create-home appuser

COPY --chown=appuser app/ app/

EXPOSE 8300
ENV PYTHONUNBUFFERED=1

# Probe the service's own /api/health endpoint; a connection error or HTTP error fails the check.
HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
    CMD python3 -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8300/api/health', timeout=3)" || exit 1

USER appuser
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8300"]
|
||||
18
services/reader/Dockerfile.worker
Normal file
18
services/reader/Dockerfile.worker
Normal file
@@ -0,0 +1,18 @@
|
||||
# Reader worker image — background feed fetcher (runs app.worker.tasks; no ports exposed).
FROM python:3.12-slim

WORKDIR /app

# libpq-dev: PostgreSQL client headers/libs (presumably needed to build the DB driver — confirm against requirements.txt)
RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev && rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip

# Install dependencies before copying code so Docker layer caching survives code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Run as an unprivileged user.
RUN adduser --disabled-password --no-create-home appuser

COPY --chown=appuser app/ app/

ENV PYTHONUNBUFFERED=1

USER appuser
CMD ["python", "-m", "app.worker.tasks"]
|
||||
0
services/reader/app/__init__.py
Normal file
0
services/reader/app/__init__.py
Normal file
0
services/reader/app/api/__init__.py
Normal file
0
services/reader/app/api/__init__.py
Normal file
49
services/reader/app/api/categories.py
Normal file
49
services/reader/app/api/categories.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""Category endpoints."""
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.api.deps import get_user_id, get_db_session
|
||||
from app.models import Category
|
||||
|
||||
router = APIRouter(prefix="/api/categories", tags=["categories"])
|
||||
|
||||
|
||||
class CategoryOut(BaseModel):
    """API response shape for a category."""

    id: int
    title: str

    class Config:
        # Allow construction directly from ORM objects.
        from_attributes = True
|
||||
|
||||
|
||||
class CategoryCreate(BaseModel):
    """Request body for creating a category."""

    title: str
|
||||
|
||||
|
||||
@router.get("", response_model=list[CategoryOut])
async def list_categories(
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Return the current user's categories, ordered alphabetically by title."""
    stmt = select(Category).where(Category.user_id == user_id).order_by(Category.title)
    result = await db.execute(stmt)
    return result.scalars().all()
|
||||
|
||||
|
||||
@router.post("", response_model=CategoryOut, status_code=201)
async def create_category(
    body: CategoryCreate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Create a new category for the current user and return it."""
    new_category = Category(user_id=user_id, title=body.title)
    db.add(new_category)
    await db.commit()
    # Refresh so the DB-assigned primary key is populated on the response.
    await db.refresh(new_category)
    return new_category
|
||||
21
services/reader/app/api/deps.py
Normal file
21
services/reader/app/api/deps.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""API dependencies — auth, database session."""
|
||||
|
||||
from fastapi import Header, HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import get_db
|
||||
|
||||
|
||||
async def get_user_id(
    x_gateway_user_id: str = Header(None, alias="X-Gateway-User-Id"),
) -> str:
    """Return the user ID the gateway injected via X-Gateway-User-Id.

    Requests without the header are rejected with 401 — the gateway is the
    only trusted source of identity for this service.
    """
    if x_gateway_user_id:
        return x_gateway_user_id
    raise HTTPException(status_code=401, detail="Not authenticated")
|
||||
|
||||
|
||||
async def get_db_session() -> AsyncSession:
    """Provide an async database session.

    Delegates to app.database.get_db (an async generator) and re-yields its
    session so FastAPI's dependency system handles setup and teardown.
    """
    async for session in get_db():
        yield session
|
||||
264
services/reader/app/api/entries.py
Normal file
264
services/reader/app/api/entries.py
Normal file
@@ -0,0 +1,264 @@
|
||||
"""Entry endpoints."""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import func, select, update
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.api.deps import get_db_session, get_user_id
|
||||
from app.config import CRAWLER_URL
|
||||
from app.models import Entry, Feed
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/api/entries", tags=["entries"])
|
||||
|
||||
|
||||
# ── Schemas ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class FeedRef(BaseModel):
    """Minimal feed reference embedded in entry responses."""

    id: int
    title: str

    class Config:
        # Allow construction directly from ORM objects.
        from_attributes = True
|
||||
|
||||
|
||||
class EntryOut(BaseModel):
    """API representation of a single feed entry."""

    id: int
    title: str | None = None
    url: str | None = None
    content: str | None = None
    full_content: str | None = None
    author: str | None = None
    published_at: str | None = None
    status: str = "unread"
    starred: bool = False
    reading_time: int = 1
    feed: FeedRef | None = None

    class Config:
        from_attributes = True

    @classmethod
    def from_entry(cls, entry: Entry) -> "EntryOut":
        """Build the API model from an ORM Entry, preferring crawled content."""
        published = entry.published_at.isoformat() if entry.published_at else None
        feed_ref = FeedRef(id=entry.feed.id, title=entry.feed.title) if entry.feed else None
        return cls(
            id=entry.id,
            title=entry.title,
            url=entry.url,
            # Serve the crawler-extracted article when present, else the RSS body.
            content=entry.full_content or entry.content,
            full_content=entry.full_content,
            author=entry.author,
            published_at=published,
            status=entry.status,
            starred=entry.starred,
            reading_time=entry.reading_time,
            feed=feed_ref,
        )
|
||||
|
||||
|
||||
class EntryListOut(BaseModel):
    """Paginated entry listing: total matching count plus the current page."""

    total: int
    entries: list[EntryOut]
|
||||
|
||||
|
||||
class EntryBulkUpdate(BaseModel):
    """Request body for batch status changes ("read"/"unread")."""

    entry_ids: list[int]
    status: str
|
||||
|
||||
|
||||
# ── Routes ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@router.get("", response_model=EntryListOut)
async def list_entries(
    status: Optional[str] = Query(None),
    starred: Optional[bool] = Query(None),
    feed_id: Optional[int] = Query(None),
    category_id: Optional[int] = Query(None),
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    direction: str = Query("desc"),
    order: str = Query("published_at"),
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """List the user's entries with filtering, ordering, and pagination.

    Returns the total count of matching rows alongside one page of results.
    """
    query = select(Entry).where(Entry.user_id == user_id)
    count_query = select(func.count(Entry.id)).where(Entry.user_id == user_id)

    # Collect simple filters once and apply them to both the page query and
    # the count query so the two can never drift apart.
    conditions = []
    if status:
        conditions.append(Entry.status == status)
    if starred is not None:
        conditions.append(Entry.starred == starred)
    if feed_id is not None:
        conditions.append(Entry.feed_id == feed_id)
    if conditions:
        query = query.where(*conditions)
        count_query = count_query.where(*conditions)

    if category_id is not None:
        # Category lives on the feed, so this filter needs a join.
        query = query.join(Feed, Entry.feed_id == Feed.id).where(Feed.category_id == category_id)
        count_query = count_query.join(Feed, Entry.feed_id == Feed.id).where(Feed.category_id == category_id)

    # Sorting: undated entries go last when ascending, first when descending.
    order_col = Entry.published_at if order == "published_at" else Entry.created_at
    query = query.order_by(
        order_col.asc().nullslast() if direction == "asc" else order_col.desc().nullsfirst()
    )

    total = (await db.execute(count_query)).scalar() or 0

    page_query = query.options(selectinload(Entry.feed)).offset(offset).limit(limit)
    page = (await db.execute(page_query)).scalars().all()

    return EntryListOut(
        total=total,
        entries=[EntryOut.from_entry(e) for e in page],
    )
|
||||
|
||||
|
||||
@router.put("")
async def bulk_update_entries(
    body: EntryBulkUpdate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Set the read/unread status on a batch of the user's entries."""
    if body.status not in {"read", "unread"}:
        raise HTTPException(status_code=400, detail="Status must be 'read' or 'unread'")

    stmt = (
        update(Entry)
        .where(Entry.user_id == user_id, Entry.id.in_(body.entry_ids))
        .values(status=body.status)
    )
    await db.execute(stmt)
    await db.commit()
    return {"ok": True}
|
||||
|
||||
|
||||
class MarkAllReadBody(BaseModel):
    """Optional scoping for mark-all-read: one feed, or one category, or everything."""

    feed_id: int | None = None
    category_id: int | None = None
|
||||
|
||||
|
||||
@router.put("/mark-all-read")
async def mark_all_read(
    body: MarkAllReadBody,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Mark ALL unread entries as read, optionally filtered by feed or category.

    Returns the number of rows updated so the client can report it.
    """
    q = update(Entry).where(Entry.user_id == user_id, Entry.status == "unread")

    if body.feed_id:
        q = q.where(Entry.feed_id == body.feed_id)
    elif body.category_id:
        # Feed is already imported at module top; the original re-imported it
        # locally here for no reason. The subquery restricts the update to
        # feeds in the requested category AND owned by this user, so a foreign
        # category_id cannot touch another user's rows.
        feed_ids_q = select(Feed.id).where(Feed.category_id == body.category_id, Feed.user_id == user_id)
        q = q.where(Entry.feed_id.in_(feed_ids_q))

    result = await db.execute(q.values(status="read"))
    await db.commit()
    return {"ok": True, "marked": result.rowcount}
|
||||
|
||||
|
||||
@router.get("/{entry_id}", response_model=EntryOut)
async def get_entry(
    entry_id: int,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Fetch a single entry (with its feed) owned by the current user."""
    stmt = (
        select(Entry)
        .options(selectinload(Entry.feed))
        .where(Entry.id == entry_id, Entry.user_id == user_id)
    )
    entry = (await db.execute(stmt)).scalar_one_or_none()
    if entry is None:
        raise HTTPException(status_code=404, detail="Entry not found")
    return EntryOut.from_entry(entry)
|
||||
|
||||
|
||||
@router.put("/{entry_id}/bookmark")
async def toggle_bookmark(
    entry_id: int,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Flip the starred flag on one of the user's entries; return the new value."""
    stmt = select(Entry).where(Entry.id == entry_id, Entry.user_id == user_id)
    entry = (await db.execute(stmt)).scalar_one_or_none()
    if entry is None:
        raise HTTPException(status_code=404, detail="Entry not found")

    entry.starred = not entry.starred
    await db.commit()
    return {"starred": entry.starred}
|
||||
|
||||
|
||||
@router.post("/{entry_id}/fetch-full-content", response_model=EntryOut)
async def fetch_full_content(
    entry_id: int,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Crawl the entry's URL and store the extracted full article content.

    Prefers the crawler's Readability-cleaned HTML ("readable_html"); falls
    back to wrapping plain text ("text") in <p> tags. Raises 404 for unknown
    entries, 400 when the entry has no URL, 502 when the crawler fails.
    """
    import re  # local import: entries.py does not import re at module level

    result = await db.execute(
        select(Entry)
        .options(selectinload(Entry.feed))
        .where(Entry.id == entry_id, Entry.user_id == user_id)
    )
    entry = result.scalar_one_or_none()
    if not entry:
        raise HTTPException(status_code=404, detail="Entry not found")

    if not entry.url:
        raise HTTPException(status_code=400, detail="Entry has no URL to crawl")

    try:
        async with httpx.AsyncClient(timeout=60) as client:
            resp = await client.post(
                f"{CRAWLER_URL}/crawl",
                json={"url": entry.url},
            )
            resp.raise_for_status()
            data = resp.json()
    except httpx.HTTPError as e:
        log.error("Crawler error for entry %d: %s", entry_id, e)
        # Chain the cause so the original httpx error survives in tracebacks.
        raise HTTPException(status_code=502, detail="Failed to fetch full content") from e

    # Prefer readable_html (Readability-extracted clean article with images).
    readable = data.get("readable_html", "")
    full_text = data.get("text", "")
    if readable:
        entry.full_content = readable
    elif full_text:
        # No clean HTML — wrap plain-text paragraphs in <p> tags.
        paragraphs = [p.strip() for p in full_text.split("\n\n") if p.strip()]
        if not paragraphs:
            paragraphs = [p.strip() for p in full_text.split("\n") if p.strip()]
        entry.full_content = "\n".join(f"<p>{p}</p>" for p in paragraphs)
    else:
        entry.full_content = ""

    # Recalculate reading time at 200 wpm. The original only did this when
    # the crawler returned plain text, leaving a stale estimate when only
    # readable_html came back; derive a word count from the HTML in that case.
    plain = full_text or re.sub(r"<[^>]+>", " ", readable)
    if plain.strip():
        entry.reading_time = max(1, len(plain.split()) // 200)

    await db.commit()
    await db.refresh(entry)
    return EntryOut.from_entry(entry)
|
||||
242
services/reader/app/api/feeds.py
Normal file
242
services/reader/app/api/feeds.py
Normal file
@@ -0,0 +1,242 @@
|
||||
"""Feed endpoints."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
import feedparser
|
||||
import httpx
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.api.deps import get_db_session, get_user_id
|
||||
from app.models import Category, Entry, Feed
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/api/feeds", tags=["feeds"])
|
||||
|
||||
|
||||
# ── Schemas ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class CategoryRef(BaseModel):
    """Minimal category reference embedded in feed responses."""

    id: int
    title: str

    class Config:
        # Allow construction directly from ORM objects.
        from_attributes = True
|
||||
|
||||
|
||||
class FeedOut(BaseModel):
    """API response shape for a feed subscription."""

    id: int
    title: str
    feed_url: str
    site_url: str | None = None
    category: CategoryRef | None = None

    class Config:
        # Allow construction directly from ORM objects.
        from_attributes = True
|
||||
|
||||
|
||||
class FeedCreate(BaseModel):
    """Request body for subscribing to a feed (URL may be a page; feed is discovered)."""

    feed_url: str
    category_id: int | None = None
|
||||
|
||||
|
||||
class CountersOut(BaseModel):
    """Unread counts keyed by feed ID (stringified)."""

    unreads: dict[str, int]
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _discover_feed_url(html: str, base_url: str) -> str | None:
|
||||
"""Try to find an RSS/Atom feed link in HTML."""
|
||||
patterns = [
|
||||
r'<link[^>]+type=["\']application/(?:rss|atom)\+xml["\'][^>]+href=["\']([^"\']+)["\']',
|
||||
r'<link[^>]+href=["\']([^"\']+)["\'][^>]+type=["\']application/(?:rss|atom)\+xml["\']',
|
||||
]
|
||||
for pat in patterns:
|
||||
match = re.search(pat, html, re.IGNORECASE)
|
||||
if match:
|
||||
href = match.group(1)
|
||||
if href.startswith("/"):
|
||||
# Resolve relative URL
|
||||
from urllib.parse import urljoin
|
||||
href = urljoin(base_url, href)
|
||||
return href
|
||||
return None
|
||||
|
||||
|
||||
async def _fetch_and_parse_feed(feed_url: str) -> tuple[str, str, str | None]:
    """
    Fetch a URL and resolve it to a feed.

    Returns (feed_url, title, site_url). If the URL serves a valid feed it is
    used directly; if it serves HTML, the advertised <link> feed is discovered
    and fetched instead. Raises HTTPException(400) when nothing feed-like is
    found; httpx errors propagate to the caller.
    """
    headers = {"User-Agent": "Reader/1.0"}
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        resp = await client.get(feed_url, headers=headers)
        resp.raise_for_status()

        body = resp.text
        parsed = feedparser.parse(body)

        # A title or any entries means the URL itself is a valid feed.
        if parsed.feed.get("title") or parsed.entries:
            return feed_url, parsed.feed.get("title", feed_url), parsed.feed.get("link")

        # Not a feed — treat the body as HTML and look for an advertised one.
        discovered = _discover_feed_url(body, feed_url)
        if not discovered:
            raise HTTPException(status_code=400, detail="No RSS/Atom feed found at this URL")

        # Follow the discovered link, reusing the same client.
        resp2 = await client.get(discovered, headers=headers)
        resp2.raise_for_status()

    parsed2 = feedparser.parse(resp2.text)
    title = parsed2.feed.get("title", discovered)
    site_url = parsed2.feed.get("link") or feed_url
    return discovered, title, site_url
|
||||
|
||||
|
||||
# ── Routes ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@router.get("/counters", response_model=CountersOut)
async def feed_counters(
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Return per-feed unread counts, keyed by feed ID (stringified)."""
    stmt = (
        select(Entry.feed_id, func.count(Entry.id))
        .where(Entry.user_id == user_id, Entry.status == "unread")
        .group_by(Entry.feed_id)
    )
    rows = (await db.execute(stmt)).all()
    return {"unreads": {str(fid): count for fid, count in rows}}
|
||||
|
||||
|
||||
@router.get("", response_model=list[FeedOut])
async def list_feeds(
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Return all of the user's feeds, ordered alphabetically by title."""
    stmt = select(Feed).where(Feed.user_id == user_id).order_by(Feed.title)
    return (await db.execute(stmt)).scalars().all()
|
||||
|
||||
|
||||
@router.post("", response_model=FeedOut, status_code=201)
async def create_feed(
    body: FeedCreate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Subscribe to a feed.

    Accepts a direct feed URL or an HTML page URL (the feed link is then
    auto-discovered). Raises 409 for duplicates, 404 for an unknown
    category, 400 for unreachable or feedless URLs.

    NOTE(review): the duplicate check is global (feed_url is unique across
    all users in the model) rather than per-user — confirm this is intended.
    """
    # Check for duplicate before doing any network work.
    existing = await db.execute(
        select(Feed).where(Feed.feed_url == body.feed_url)
    )
    if existing.scalar_one_or_none():
        raise HTTPException(status_code=409, detail="Feed already exists")

    # Validate category belongs to user.
    if body.category_id:
        cat = await db.execute(
            select(Category).where(
                Category.id == body.category_id,
                Category.user_id == user_id,
            )
        )
        if not cat.scalar_one_or_none():
            raise HTTPException(status_code=404, detail="Category not found")

    # Fetch, and discover the real feed URL if an HTML page was given.
    try:
        actual_url, title, site_url = await _fetch_and_parse_feed(body.feed_url)
    except httpx.HTTPError as e:
        log.warning("Failed to fetch feed %s: %s", body.feed_url, e)
        # Chain the cause so the underlying httpx error is preserved.
        raise HTTPException(status_code=400, detail=f"Could not fetch feed: {e}") from e

    # Discovery may have produced a different URL — re-check for duplicates.
    if actual_url != body.feed_url:
        existing = await db.execute(
            select(Feed).where(Feed.feed_url == actual_url)
        )
        if existing.scalar_one_or_none():
            raise HTTPException(status_code=409, detail="Feed already exists")

    feed = Feed(
        user_id=user_id,
        category_id=body.category_id,
        title=title,
        feed_url=actual_url,
        site_url=site_url,
    )
    db.add(feed)
    await db.commit()
    await db.refresh(feed)
    return feed
|
||||
|
||||
|
||||
@router.delete("/{feed_id}", status_code=204)
async def delete_feed(
    feed_id: int,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Delete one of the user's feeds (its entries cascade via the ORM relationship)."""
    stmt = select(Feed).where(Feed.id == feed_id, Feed.user_id == user_id)
    feed = (await db.execute(stmt)).scalar_one_or_none()
    if feed is None:
        raise HTTPException(status_code=404, detail="Feed not found")
    await db.delete(feed)
    await db.commit()
|
||||
|
||||
|
||||
@router.post("/{feed_id}/refresh")
async def refresh_feed(
    feed_id: int,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Synchronously re-fetch a single feed owned by the current user."""
    stmt = select(Feed).where(Feed.id == feed_id, Feed.user_id == user_id)
    feed = (await db.execute(stmt)).scalar_one_or_none()
    if feed is None:
        raise HTTPException(status_code=404, detail="Feed not found")

    # Imports kept local, matching the original (presumably to avoid an
    # api<->worker import cycle — confirm). The sync fetch runs in a thread
    # so the event loop is not blocked.
    import asyncio
    from app.worker.tasks import fetch_single_feed

    await asyncio.to_thread(fetch_single_feed, feed_id)
    return {"ok": True, "message": f"Refreshed {feed.title}"}
|
||||
|
||||
|
||||
@router.post("/refresh-all")
async def refresh_all_feeds(
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Re-fetch every feed the user owns, best-effort.

    Individual feed failures are logged and skipped so one broken feed
    cannot abort the whole refresh.
    """
    result = await db.execute(
        select(Feed).where(Feed.user_id == user_id)
    )
    feeds = result.scalars().all()

    import asyncio
    from app.worker.tasks import fetch_single_feed

    for feed in feeds:
        try:
            await asyncio.to_thread(fetch_single_feed, feed.id)
        except Exception:
            # Previously swallowed silently; log with the traceback so
            # failures are diagnosable while keeping best-effort behavior.
            log.exception("Refresh failed for feed %d (%s)", feed.id, feed.title)

    return {"ok": True, "message": f"Refreshed {len(feeds)} feeds"}
|
||||
23
services/reader/app/config.py
Normal file
23
services/reader/app/config.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Reader service configuration — all from environment variables."""
|
||||
|
||||
import os
|
||||
|
||||
# ── Database (reuse Brain's PostgreSQL) ──
|
||||
DATABASE_URL = os.environ.get(
|
||||
"DATABASE_URL",
|
||||
"postgresql+asyncpg://brain:brain@brain-db:5432/brain",
|
||||
)
|
||||
DATABASE_URL_SYNC = DATABASE_URL.replace("+asyncpg", "")
|
||||
|
||||
# ── Redis (reuse Brain's Redis) ──
|
||||
REDIS_URL = os.environ.get("REDIS_URL", "redis://brain-redis:6379/0")
|
||||
|
||||
# ── Crawler (reuse Brain's Playwright crawler) ──
|
||||
CRAWLER_URL = os.environ.get("CRAWLER_URL", "http://brain-crawler:3100")
|
||||
|
||||
# ── Service ──
|
||||
PORT = int(os.environ.get("PORT", "8300"))
|
||||
DEBUG = os.environ.get("DEBUG", "").lower() in ("1", "true")
|
||||
|
||||
# ── Feed fetch interval (seconds) ──
|
||||
FEED_FETCH_INTERVAL = int(os.environ.get("FEED_FETCH_INTERVAL", "600"))
|
||||
18
services/reader/app/database.py
Normal file
18
services/reader/app/database.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""Database session and engine setup."""
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
|
||||
from sqlalchemy.orm import DeclarativeBase
|
||||
|
||||
from app.config import DATABASE_URL
|
||||
|
||||
engine = create_async_engine(DATABASE_URL, echo=False, pool_size=10, max_overflow=5)
|
||||
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base shared by all reader ORM models."""
    pass
|
||||
|
||||
|
||||
async def get_db() -> AsyncSession:
    """Yield one async session per request, closed automatically by the context manager.

    NOTE(review): the annotation says AsyncSession but this is an async
    generator (AsyncIterator[AsyncSession]) — harmless for FastAPI's
    dependency machinery, but the hint is technically inaccurate.
    """
    async with async_session() as session:
        yield session
|
||||
43
services/reader/app/main.py
Normal file
43
services/reader/app/main.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Reader service — FastAPI entrypoint."""
|
||||
|
||||
import logging
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
from app.api.categories import router as categories_router
|
||||
from app.api.feeds import router as feeds_router
|
||||
from app.api.entries import router as entries_router
|
||||
from app.config import DEBUG
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG if DEBUG else logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
|
||||
)
|
||||
|
||||
app = FastAPI(
|
||||
title="Reader",
|
||||
description="Self-hosted RSS reader — replaces Miniflux.",
|
||||
version="1.0.0",
|
||||
docs_url="/api/docs" if DEBUG else None,
|
||||
redoc_url=None,
|
||||
)
|
||||
|
||||
app.include_router(categories_router)
|
||||
app.include_router(feeds_router)
|
||||
app.include_router(entries_router)
|
||||
|
||||
|
||||
@app.get("/api/health")
async def health():
    """Liveness probe — targeted by the Docker HEALTHCHECK in Dockerfile.api."""
    return {"status": "ok"}
|
||||
|
||||
|
||||
@app.on_event("startup")
async def startup():
    """Create the reader tables on boot and log readiness.

    NOTE(review): @app.on_event is deprecated in newer FastAPI in favor of
    lifespan handlers — consider migrating when convenient.
    """
    from app.database import engine, Base
    from app.models import Category, Feed, Entry  # noqa: register models

    async with engine.begin() as conn:
        # create_all is idempotent — only missing tables are created.
        await conn.run_sync(Base.metadata.create_all)

    logging.getLogger(__name__).info("Reader service started")
|
||||
74
services/reader/app/models.py
Normal file
74
services/reader/app/models.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""SQLAlchemy models for the reader service."""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import (
|
||||
Boolean,
|
||||
Column,
|
||||
DateTime,
|
||||
ForeignKey,
|
||||
Index,
|
||||
Integer,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
)
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Category(Base):
    """A user's grouping for feeds (one row per user per title)."""

    __tablename__ = "reader_categories"

    id = Column(Integer, primary_key=True, autoincrement=True)
    user_id = Column(String(64), nullable=False)  # gateway-supplied user ID
    title = Column(String(255), nullable=False)
    # NOTE(review): datetime.utcnow is deprecated (3.12+) and naive — consider
    # datetime.now(timezone.utc) once column tz-handling is reviewed.
    created_at = Column(DateTime, default=datetime.utcnow)

    feeds = relationship("Feed", back_populates="category", lazy="selectin")
|
||||
|
||||
|
||||
class Feed(Base):
    """An RSS/Atom subscription owned by a single user."""

    __tablename__ = "reader_feeds"

    id = Column(Integer, primary_key=True, autoincrement=True)
    user_id = Column(String(64), nullable=False)
    # Deleting a category detaches its feeds rather than deleting them.
    category_id = Column(Integer, ForeignKey("reader_categories.id", ondelete="SET NULL"), nullable=True)
    title = Column(String(500), nullable=False)
    feed_url = Column(Text, nullable=False, unique=True)  # unique globally, not per-user
    site_url = Column(Text)
    etag = Column(String(255))            # HTTP ETag, sent as If-None-Match on re-fetch
    last_modified = Column(String(255))   # HTTP Last-Modified, sent as If-Modified-Since
    last_fetched_at = Column(DateTime)
    created_at = Column(DateTime, default=datetime.utcnow)

    category = relationship("Category", back_populates="feeds", lazy="selectin")
    # Deleting a feed removes all of its entries via ORM cascade.
    entries = relationship("Entry", back_populates="feed", lazy="noload", cascade="all, delete-orphan")
|
||||
|
||||
|
||||
class Entry(Base):
    """A single article/item belonging to a feed."""

    __tablename__ = "reader_entries"
    __table_args__ = (
        # One row per (feed, url) — prevents duplicate inserts on re-fetch.
        UniqueConstraint("feed_id", "url", name="uq_reader_entries_feed_url"),
        Index("idx_reader_entries_user_status", "user_id", "status"),
        Index("idx_reader_entries_user_starred", "user_id", "starred"),
        Index("idx_reader_entries_feed", "feed_id"),
        Index("idx_reader_entries_published", "published_at"),
    )

    id = Column(Integer, primary_key=True, autoincrement=True)
    feed_id = Column(Integer, ForeignKey("reader_feeds.id", ondelete="CASCADE"), nullable=False)
    user_id = Column(String(64), nullable=False)
    title = Column(String(1000))
    url = Column(Text)
    content = Column(Text)        # body as delivered by the RSS/Atom entry
    full_content = Column(Text)   # crawler-extracted full article, if fetched
    author = Column(String(500))
    published_at = Column(DateTime)
    status = Column(String(10), default="unread")  # "unread" | "read"
    starred = Column(Boolean, default=False)
    reading_time = Column(Integer, default=1)      # estimated minutes, min 1
    created_at = Column(DateTime, default=datetime.utcnow)

    # Eagerly loaded alongside the entry (used by EntryOut.from_entry).
    feed = relationship("Feed", back_populates="entries", lazy="selectin")
|
||||
0
services/reader/app/worker/__init__.py
Normal file
0
services/reader/app/worker/__init__.py
Normal file
363
services/reader/app/worker/tasks.py
Normal file
363
services/reader/app/worker/tasks.py
Normal file
@@ -0,0 +1,363 @@
|
||||
"""Feed fetching worker — RQ tasks and scheduling loop."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import feedparser
|
||||
import httpx
|
||||
from dateutil import parser as dateparser
|
||||
from redis import Redis
|
||||
from rq import Queue
|
||||
from sqlalchemy import create_engine, select
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
|
||||
from app.config import DATABASE_URL_SYNC, FEED_FETCH_INTERVAL, REDIS_URL
|
||||
from app.models import Category, Entry, Feed
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# ── Sync DB engine (for RQ worker) ──
|
||||
_engine = create_engine(DATABASE_URL_SYNC, echo=False, pool_size=5, max_overflow=3)
|
||||
SyncSession = sessionmaker(_engine, class_=Session, expire_on_commit=False)
|
||||
|
||||
# ── RQ queue ──
|
||||
_redis = Redis.from_url(REDIS_URL)
|
||||
queue = Queue("reader", connection=_redis)
|
||||
|
||||
# HTML tag stripper
|
||||
_html_re = re.compile(r"<[^>]+>")
|
||||
|
||||
|
||||
def _strip_html(text: str) -> str:
|
||||
"""Remove HTML tags for word counting."""
|
||||
if not text:
|
||||
return ""
|
||||
return _html_re.sub("", text)
|
||||
|
||||
|
||||
def _calc_reading_time(html_content: str) -> int:
|
||||
"""Estimate reading time in minutes from HTML content."""
|
||||
plain = _strip_html(html_content)
|
||||
word_count = len(plain.split())
|
||||
return max(1, word_count // 200)
|
||||
|
||||
|
||||
def _parse_date(entry: dict) -> datetime | None:
|
||||
"""Parse published date from a feedparser entry."""
|
||||
for field in ("published", "updated", "created"):
|
||||
val = entry.get(field)
|
||||
if val:
|
||||
try:
|
||||
return dateparser.parse(val)
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
# Try struct_time fields
|
||||
for field in ("published_parsed", "updated_parsed", "created_parsed"):
|
||||
val = entry.get(field)
|
||||
if val:
|
||||
try:
|
||||
return datetime(*val[:6], tzinfo=timezone.utc)
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def _get_entry_content(entry: dict) -> str:
|
||||
"""Extract the best content from a feedparser entry."""
|
||||
# Prefer content field (often full HTML)
|
||||
if entry.get("content"):
|
||||
return entry["content"][0].get("value", "")
|
||||
# Fall back to summary
|
||||
if entry.get("summary"):
|
||||
return entry["summary"]
|
||||
# Fall back to description
|
||||
if entry.get("description"):
|
||||
return entry["description"]
|
||||
return ""
|
||||
|
||||
|
||||
def _get_entry_author(entry: dict) -> str | None:
|
||||
"""Extract author from a feedparser entry."""
|
||||
if entry.get("author"):
|
||||
return entry["author"]
|
||||
if entry.get("author_detail", {}).get("name"):
|
||||
return entry["author_detail"]["name"]
|
||||
return None
|
||||
|
||||
|
||||
def _ensure_uncategorized(db: Session, user_id: str) -> int:
    """Return the ID of the user's 'Uncategorized' category, creating it if needed."""
    existing = db.execute(
        select(Category)
        .where(Category.user_id == user_id)
        .where(Category.title == "Uncategorized")
    ).scalar_one_or_none()
    if existing is not None:
        return existing.id

    # Not found — create it and flush so the new row receives its primary key.
    created = Category(user_id=user_id, title="Uncategorized")
    db.add(created)
    db.flush()
    return created.id
|
||||
|
||||
|
||||
def fetch_single_feed(feed_id: int) -> None:
    """Fetch and parse a single feed, inserting new entries.

    Performs a conditional GET (ETag / If-Modified-Since), parses the
    body with feedparser, and inserts each entry with
    ``ON CONFLICT DO NOTHING`` on the ``uq_reader_entries_feed_url``
    constraint so duplicates are skipped. Newly inserted entries get an
    immediate full-content crawl via _fetch_full_content_for_entries.
    """
    with SyncSession() as db:
        feed = db.execute(select(Feed).where(Feed.id == feed_id)).scalar_one_or_none()
        if not feed:
            log.warning("Feed %d not found, skipping", feed_id)
            return

        log.info("Fetching feed %d: %s", feed.id, feed.feed_url)

        # Conditional-request headers so unchanged feeds can answer 304.
        headers = {"User-Agent": "Reader/1.0"}
        if feed.etag:
            headers["If-None-Match"] = feed.etag
        if feed.last_modified:
            headers["If-Modified-Since"] = feed.last_modified

        try:
            resp = httpx.get(feed.feed_url, headers=headers, timeout=30, follow_redirects=True)
        except httpx.HTTPError as e:
            log.error("HTTP error fetching feed %d: %s", feed.id, e)
            return

        # 304 Not Modified: just record the fetch time and stop.
        if resp.status_code == 304:
            log.debug("Feed %d not modified", feed.id)
            feed.last_fetched_at = datetime.utcnow()
            db.commit()
            return

        # NOTE(review): on non-200 we return without updating
        # last_fetched_at, so a persistently failing feed keeps its old
        # timestamp — confirm this is intended.
        if resp.status_code != 200:
            log.warning("Feed %d returned status %d", feed.id, resp.status_code)
            return

        # Store the validators for the next conditional request.
        feed.etag = resp.headers.get("ETag")
        feed.last_modified = resp.headers.get("Last-Modified")
        feed.last_fetched_at = datetime.utcnow()

        parsed = feedparser.parse(resp.text)
        if not parsed.entries:
            log.debug("Feed %d has no entries", feed.id)
            db.commit()
            return

        new_count = 0
        new_entry_ids = []
        for fe in parsed.entries:
            # Entries without a link cannot be deduplicated or crawled.
            url = fe.get("link")
            if not url:
                continue

            content = _get_entry_content(fe)
            pub_date = _parse_date(fe)

            # INSERT ... ON CONFLICT DO NOTHING keyed on (feed, url);
            # RETURNING yields a row only when the entry is actually new.
            stmt = pg_insert(Entry).values(
                feed_id=feed.id,
                user_id=feed.user_id,
                title=fe.get("title", "")[:1000] if fe.get("title") else None,
                url=url,
                content=content,
                author=_get_entry_author(fe),
                published_at=pub_date,
                status="unread",
                starred=False,
                reading_time=_calc_reading_time(content),
            ).on_conflict_do_nothing(
                constraint="uq_reader_entries_feed_url"
            ).returning(Entry.id)
            result = db.execute(stmt)
            row = result.fetchone()
            if row:
                new_entry_ids.append(row[0])
                new_count += 1

        db.commit()
        log.info("Feed %d: %d new entries from %d total", feed.id, new_count, len(parsed.entries))

        # Fetch full content for new entries
        if new_entry_ids:
            _fetch_full_content_for_entries(db, new_entry_ids)
|
||||
|
||||
|
||||
def _fetch_full_content_for_entries(db, entry_ids: list[int]) -> None:
    """Fetch full article content for specific entries.

    For each entry, POSTs its URL to the crawler service and stores the
    result in ``entry.full_content``:
      * crawler "readable_html" when available (reading time is
        recalculated from the plain "text" when that is also present);
      * otherwise the crawler plain "text" wrapped in <p> tags, but only
        when it is longer than the tag-stripped RSS content;
      * otherwise (or on any error / non-200 response) the original RSS
        ``content``.

    Commits once at the end. NOTE: the per-entry logic here is
    duplicated in fetch_full_content_batch — keep the two in sync.
    """
    from app.config import CRAWLER_URL

    entries = db.execute(
        select(Entry).where(Entry.id.in_(entry_ids))
    ).scalars().all()

    log.info("Fetching full content for %d new entries", len(entries))
    for entry in entries:
        if not entry.url:
            continue
        try:
            resp = httpx.post(
                f"{CRAWLER_URL}/crawl",
                json={"url": entry.url},
                timeout=45,
            )
            if resp.status_code == 200:
                data = resp.json()
                readable = data.get("readable_html", "")
                full_text = data.get("text", "")
                if readable:
                    # Crawler produced Readability HTML — preferred form.
                    entry.full_content = readable
                    if full_text:
                        # ~200 words per minute, minimum 1 minute.
                        entry.reading_time = max(1, len(full_text.split()) // 200)
                elif full_text and len(full_text) > len(_strip_html(entry.content or "")):
                    # Plain text only: split into paragraphs (blank-line
                    # separated, falling back to single newlines) and wrap
                    # each in <p> so the reader can render it as HTML.
                    paragraphs = [p.strip() for p in full_text.split("\n\n") if p.strip()]
                    if not paragraphs:
                        paragraphs = [p.strip() for p in full_text.split("\n") if p.strip()]
                    entry.full_content = "\n".join(f"<p>{p}</p>" for p in paragraphs)
                    entry.reading_time = max(1, len(full_text.split()) // 200)
                else:
                    entry.full_content = entry.content or ""
            else:
                entry.full_content = entry.content or ""
        except Exception as e:
            # Best-effort: fall back to the RSS summary rather than fail.
            log.warning("Full content fetch failed for entry %d: %s", entry.id, e)
            entry.full_content = entry.content or ""

    db.commit()
    log.info("Full content done for %d entries", len(entries))
|
||||
|
||||
|
||||
def fetch_full_content_batch() -> None:
    """Fetch full article content for entries that only have RSS summaries.

    Batch variant of _fetch_full_content_for_entries: opens its own
    session, selects up to 20 unread entries that have a URL but no
    full_content yet (newest first), and applies the same
    crawler-then-fallback logic. NOTE: the per-entry logic is duplicated
    from _fetch_full_content_for_entries — keep the two in sync.
    """
    from app.config import CRAWLER_URL

    with SyncSession() as db:
        # Find entries with short content and no full_content (limit batch size)
        entries = db.execute(
            select(Entry).where(
                Entry.full_content.is_(None),
                Entry.url.isnot(None),
                Entry.status == "unread",
            ).order_by(Entry.published_at.desc()).limit(20)
        ).scalars().all()

        if not entries:
            return

        log.info("Fetching full content for %d entries", len(entries))
        for entry in entries:
            try:
                resp = httpx.post(
                    f"{CRAWLER_URL}/crawl",
                    json={"url": entry.url},
                    timeout=45,
                )
                if resp.status_code == 200:
                    data = resp.json()
                    readable = data.get("readable_html", "")
                    full_text = data.get("text", "")
                    if readable:
                        # Crawler produced Readability HTML — preferred form.
                        entry.full_content = readable
                        if full_text:
                            # ~200 words per minute, minimum 1 minute.
                            entry.reading_time = max(1, len(full_text.split()) // 200)
                    elif full_text and len(full_text) > len(_strip_html(entry.content or "")):
                        # Plain text only: wrap paragraphs in <p> tags so
                        # the reader can render the result as HTML.
                        paragraphs = [p.strip() for p in full_text.split("\n\n") if p.strip()]
                        if not paragraphs:
                            paragraphs = [p.strip() for p in full_text.split("\n") if p.strip()]
                        entry.full_content = "\n".join(f"<p>{p}</p>" for p in paragraphs)
                        entry.reading_time = max(1, len(full_text.split()) // 200)
                    else:
                        entry.full_content = entry.content or ""
                else:
                    entry.full_content = entry.content or ""
            except Exception as e:
                # Best-effort: fall back to the RSS-provided content.
                log.warning("Full content fetch failed for entry %d: %s", entry.id, e)
                entry.full_content = entry.content or ""

        db.commit()
        log.info("Full content fetched for %d entries", len(entries))
|
||||
|
||||
|
||||
def fetch_all_feeds():
    """Fetch all feeds sequentially — called on schedule by the worker loop.

    Loads only the feed IDs and closes the session before fetching, so a
    DB connection is not held open for the whole network-bound fetch
    pass; fetch_single_feed opens its own short-lived session per feed.
    """
    with SyncSession() as db:
        feed_ids = [row[0] for row in db.execute(select(Feed.id)).all()]

    log.info("Scheduling fetch for %d feeds", len(feed_ids))
    for feed_id in feed_ids:
        try:
            fetch_single_feed(feed_id)
        except Exception:
            # One failing feed must not abort the rest of the pass.
            log.exception("Error fetching feed %d", feed_id)

    # Full content is now fetched inline for each new entry
|
||||
|
||||
|
||||
def cleanup_old_entries():
    """Delete old entries: read > 30 days, unread > 60 days.

    Ages are measured against ``Entry.created_at`` (row creation time,
    not the article's published date). Counts are logged when anything
    was deleted.
    """
    from datetime import timedelta
    from sqlalchemy import delete as sa_delete

    with SyncSession() as db:
        now = datetime.utcnow()
        thirty_days_ago = now - timedelta(days=30)
        sixty_days_ago = now - timedelta(days=60)

        # Read entries older than 30 days.
        read_result = db.execute(
            sa_delete(Entry).where(
                Entry.status == "read",
                Entry.created_at < thirty_days_ago,
            )
        )

        # Unread entries older than 60 days.
        unread_result = db.execute(
            sa_delete(Entry).where(
                Entry.status == "unread",
                Entry.created_at < sixty_days_ago,
            )
        )

        db.commit()
        read_n = read_result.rowcount or 0
        unread_n = unread_result.rowcount or 0
        total = read_n + unread_n
        if total > 0:
            log.info("Cleanup: deleted %d old entries (%d read, %d unread)",
                     total, read_n, unread_n)
|
||||
|
||||
|
||||
def run_scheduler():
    """Simple loop that runs fetch_all_feeds every FEED_FETCH_INTERVAL seconds.

    Also runs cleanup_old_entries roughly once per day. The daily cadence
    is derived from the configured interval instead of the previous
    hard-coded 144 cycles, which was only correct for a 600 s interval.
    Never returns.
    """
    log.info("Reader scheduler started — interval: %ds", FEED_FETCH_INTERVAL)

    # Create tables on first run (for the sync engine)
    from app.database import Base
    from app.models import Category, Feed, Entry  # noqa: register models
    Base.metadata.create_all(_engine)

    # Number of fetch cycles in ~24h (144 at the default 600 s interval).
    cycles_per_day = max(1, 86400 // max(1, FEED_FETCH_INTERVAL))

    cycles = 0
    while True:
        try:
            fetch_all_feeds()
        except Exception:
            log.exception("Scheduler error in fetch_all_feeds")

        # Run cleanup approximately once per day.
        cycles += 1
        if cycles % cycles_per_day == 0:
            try:
                cleanup_old_entries()
            except Exception:
                log.exception("Scheduler error in cleanup")

        time.sleep(FEED_FETCH_INTERVAL)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Worker entry point: configure root logging, then run the blocking
    # scheduler loop (never returns).
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    run_scheduler()
|
||||
43
services/reader/docker-compose.yml
Normal file
43
services/reader/docker-compose.yml
Normal file
@@ -0,0 +1,43 @@
|
||||
services:
  # ── API ──
  reader-api:
    build:
      context: .
      dockerfile: Dockerfile.api
    container_name: reader-api
    restart: unless-stopped
    environment:
      # Shares the Brain stack's Postgres, Redis, and crawler containers.
      # NOTE(review): DB credentials are hard-coded here — consider
      # moving them to an env file / secret.
      - DATABASE_URL=postgresql+asyncpg://brain:brain@brain-db:5432/brain
      - REDIS_URL=redis://brain-redis:6379/0
      - CRAWLER_URL=http://brain-crawler:3100
      - PORT=8300
      - DEBUG=${DEBUG:-0}
      - TZ=${TZ:-America/Chicago}
    networks:
      - default
      - pangolin   # external reverse-proxy network
      - brain      # external Brain stack network

  # ── Worker (feed fetcher + scheduler) ──
  reader-worker:
    build:
      context: .
      dockerfile: Dockerfile.worker
    container_name: reader-worker
    restart: unless-stopped
    environment:
      - DATABASE_URL=postgresql+asyncpg://brain:brain@brain-db:5432/brain
      - REDIS_URL=redis://brain-redis:6379/0
      - CRAWLER_URL=http://brain-crawler:3100
      # Seconds between full feed-fetch passes (600 = 10 minutes).
      - FEED_FETCH_INTERVAL=600
      - TZ=${TZ:-America/Chicago}
    networks:
      - default
      - brain

networks:
  pangolin:
    external: true
  brain:
    name: brain_default
    external: true
|
||||
11
services/reader/requirements.txt
Normal file
11
services/reader/requirements.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
# Web framework + ASGI server
fastapi==0.115.0
uvicorn[standard]==0.32.0
# Database: async API + sync worker drivers
sqlalchemy[asyncio]==2.0.35
asyncpg==0.30.0
psycopg2-binary==2.9.10
# Validation / serialization
pydantic==2.10.0
# HTTP client and feed parsing
httpx==0.28.0
feedparser==6.0.11
# Job queue (Redis + RQ)
redis==5.2.0
rq==2.1.0
# Date parsing for feed timestamps
python-dateutil==2.9.0
|
||||
Reference in New Issue
Block a user