From a3eabf3e3b80b34aa084ad525eab52c1b6946dec Mon Sep 17 00:00:00 2001 From: Yusuf Suleman Date: Fri, 3 Apr 2026 19:32:47 -0500 Subject: [PATCH] =?UTF-8?q?feat:=20thumbnail=20extraction=20for=20Reader?= =?UTF-8?q?=20=E2=80=94=20fixes=20all=20clients?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server-side (dashboard + iOS + any client): - Added thumbnail column to reader_entries - Worker extracts from media:thumbnail, media:content, enclosures, HTML img - API returns thumbnail in EntryOut with & decoding - Backfilled 260 existing entries iOS: - Prefers API thumbnail, falls back to client-side extraction - Decodes HTML entities in URLs Co-Authored-By: Claude Opus 4.6 (1M context) --- .../Features/Reader/Models/ReaderModels.swift | 11 +++++++- services/reader/app/api/entries.py | 21 +++++++++++++++ services/reader/app/models.py | 1 + services/reader/app/worker/tasks.py | 27 +++++++++++++++++++ 4 files changed, 59 insertions(+), 1 deletion(-) diff --git a/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift b/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift index 699d877..167d5f2 100644 --- a/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift +++ b/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift @@ -18,6 +18,7 @@ struct ReaderEntry: Codable, Identifiable, Hashable { let status: String let starred: Bool let readingTime: Int + let thumbnail: String? let feed: ReaderFeedRef? var isRead: Bool { status == "read" } @@ -49,7 +50,12 @@ struct ReaderEntry: Codable, Identifiable, Hashable { } var thumbnailURL: URL? { - ReaderEntry.extractThumbnail(from: content ?? fullContent ?? "") + // Prefer server-provided thumbnail + if let thumb = thumbnail, !thumb.isEmpty, let url = URL(string: thumb) { + return url + } + // Fallback: extract from content + return ReaderEntry.extractThumbnail(from: content ?? fullContent ?? "") } private static let imgRegex = try! 
NSRegularExpression( @@ -68,6 +74,9 @@ struct ReaderEntry: Codable, Identifiable, Hashable { return nil } let src = String(searchRange[srcRange]) + .replacingOccurrences(of: "&amp;", with: "&") + .replacingOccurrences(of: "&lt;", with: "<") + .replacingOccurrences(of: "&gt;", with: ">") return URL(string: src) } diff --git a/services/reader/app/api/entries.py b/services/reader/app/api/entries.py index b408869..be3ea9e 100644 --- a/services/reader/app/api/entries.py +++ b/services/reader/app/api/entries.py @@ -40,6 +40,7 @@ class EntryOut(BaseModel): status: str = "unread" starred: bool = False reading_time: int = 1 + thumbnail: str | None = None feed: FeedRef | None = None class Config: @@ -49,6 +50,10 @@ class EntryOut(BaseModel): def from_entry(cls, entry: Entry) -> "EntryOut": # Use full_content if available, otherwise RSS content best_content = entry.full_content if entry.full_content else entry.content + # Extract thumbnail from stored field, or from content + thumb = entry.thumbnail + if not thumb: + thumb = cls._extract_thumbnail(entry.content or entry.full_content or "") return cls( id=entry.id, title=entry.title, @@ -60,9 +65,25 @@ class EntryOut(BaseModel): status=entry.status, starred=entry.starred, reading_time=entry.reading_time, + thumbnail=thumb, feed=FeedRef(id=entry.feed.id, title=entry.feed.title) if entry.feed else None, ) + @staticmethod + def _extract_thumbnail(html: str) -> str | None: + """Extract first image URL from HTML content.""" + if not html: + return None + import re + match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', html[:3000], re.IGNORECASE) + if match: + url = match.group(1).replace("&amp;", "&") + # Skip tiny tracking pixels and icons + if any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]): + return None + return url + return None + class EntryListOut(BaseModel): total: int diff --git a/services/reader/app/models.py b/services/reader/app/models.py index 2aa6f35..9b9258e 100644 --- a/services/reader/app/models.py +++ 
b/services/reader/app/models.py @@ -68,6 +68,7 @@ class Entry(Base): published_at = Column(DateTime) status = Column(String(10), default="unread") starred = Column(Boolean, default=False) + thumbnail = Column(Text) reading_time = Column(Integer, default=1) created_at = Column(DateTime, default=datetime.utcnow) diff --git a/services/reader/app/worker/tasks.py b/services/reader/app/worker/tasks.py index ea92652..cb3af37 100644 --- a/services/reader/app/worker/tasks.py +++ b/services/reader/app/worker/tasks.py @@ -79,6 +79,31 @@ def _get_entry_content(entry: dict) -> str: return "" +def _extract_thumbnail(entry: dict, content: str) -> str | None: + """Extract thumbnail from feedparser entry or content HTML.""" + # 1. Check media:thumbnail + for mt in entry.get("media_thumbnail", []): + if mt.get("url"): + return mt["url"] + # 2. Check media:content with image type + for mc in entry.get("media_content", []): + if mc.get("medium") == "image" or (mc.get("type", "").startswith("image")): + if mc.get("url"): + return mc["url"] + # 3. Check enclosures + for enc in entry.get("enclosures", []): + if enc.get("type", "").startswith("image") and enc.get("href"): + return enc["href"] + # 4. 
Extract from content HTML + if content: + match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', content[:3000], re.IGNORECASE) + if match: + url = match.group(1).replace("&amp;", "&") + if not any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]): + return url + return None + + def _get_entry_author(entry: dict) -> str | None: """Extract author from a feedparser entry.""" if entry.get("author"): @@ -157,6 +182,7 @@ def fetch_single_feed(feed_id: int): content = _get_entry_content(fe) pub_date = _parse_date(fe) + thumb = _extract_thumbnail(fe, content) stmt = pg_insert(Entry).values( feed_id=feed.id, @@ -168,6 +194,7 @@ def fetch_single_feed(feed_id: int): published_at=pub_date, status="unread", starred=False, + thumbnail=thumb, reading_time=_calc_reading_time(content), ).on_conflict_do_nothing( constraint="uq_reader_entries_feed_url"