feat: thumbnail extraction for Reader — fixes all clients
Server-side (dashboard + iOS + any client): - Added thumbnail column to reader_entries - Worker extracts from media:thumbnail, media:content, enclosures, HTML img - API returns thumbnail in EntryOut with &amp; decoding - Backfilled 260 existing entries iOS: - Prefers API thumbnail, falls back to client-side extraction - Decodes HTML entities in URLs Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,7 @@ class EntryOut(BaseModel):
|
||||
status: str = "unread"
|
||||
starred: bool = False
|
||||
reading_time: int = 1
|
||||
thumbnail: str | None = None
|
||||
feed: FeedRef | None = None
|
||||
|
||||
class Config:
|
||||
@@ -49,6 +50,10 @@ class EntryOut(BaseModel):
|
||||
def from_entry(cls, entry: Entry) -> "EntryOut":
|
||||
# Use full_content if available, otherwise RSS content
|
||||
best_content = entry.full_content if entry.full_content else entry.content
|
||||
# Extract thumbnail from stored field, or from content
|
||||
thumb = entry.thumbnail
|
||||
if not thumb:
|
||||
thumb = cls._extract_thumbnail(entry.content or entry.full_content or "")
|
||||
return cls(
|
||||
id=entry.id,
|
||||
title=entry.title,
|
||||
@@ -60,9 +65,25 @@ class EntryOut(BaseModel):
|
||||
status=entry.status,
|
||||
starred=entry.starred,
|
||||
reading_time=entry.reading_time,
|
||||
thumbnail=thumb,
|
||||
feed=FeedRef(id=entry.feed.id, title=entry.feed.title) if entry.feed else None,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _extract_thumbnail(html: str) -> str | None:
|
||||
"""Extract first image URL from HTML content."""
|
||||
if not html:
|
||||
return None
|
||||
import re
|
||||
match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', html[:3000], re.IGNORECASE)
|
||||
if match:
|
||||
url = match.group(1).replace("&", "&")
|
||||
# Skip tiny tracking pixels and icons
|
||||
if any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]):
|
||||
return None
|
||||
return url
|
||||
return None
|
||||
|
||||
|
||||
class EntryListOut(BaseModel):
|
||||
total: int
|
||||
|
||||
@@ -68,6 +68,7 @@ class Entry(Base):
|
||||
published_at = Column(DateTime)
|
||||
status = Column(String(10), default="unread")
|
||||
starred = Column(Boolean, default=False)
|
||||
thumbnail = Column(Text)
|
||||
reading_time = Column(Integer, default=1)
|
||||
created_at = Column(DateTime, default=datetime.utcnow)
|
||||
|
||||
|
||||
@@ -79,6 +79,31 @@ def _get_entry_content(entry: dict) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _extract_thumbnail(entry: dict, content: str) -> str | None:
|
||||
"""Extract thumbnail from feedparser entry or content HTML."""
|
||||
# 1. Check media:thumbnail
|
||||
for mt in entry.get("media_thumbnail", []):
|
||||
if mt.get("url"):
|
||||
return mt["url"]
|
||||
# 2. Check media:content with image type
|
||||
for mc in entry.get("media_content", []):
|
||||
if mc.get("medium") == "image" or (mc.get("type", "").startswith("image")):
|
||||
if mc.get("url"):
|
||||
return mc["url"]
|
||||
# 3. Check enclosures
|
||||
for enc in entry.get("enclosures", []):
|
||||
if enc.get("type", "").startswith("image") and enc.get("href"):
|
||||
return enc["href"]
|
||||
# 4. Extract from content HTML
|
||||
if content:
|
||||
match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', content[:3000], re.IGNORECASE)
|
||||
if match:
|
||||
url = match.group(1).replace("&", "&")
|
||||
if not any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]):
|
||||
return url
|
||||
return None
|
||||
|
||||
|
||||
def _get_entry_author(entry: dict) -> str | None:
|
||||
"""Extract author from a feedparser entry."""
|
||||
if entry.get("author"):
|
||||
@@ -157,6 +182,7 @@ def fetch_single_feed(feed_id: int):
|
||||
|
||||
content = _get_entry_content(fe)
|
||||
pub_date = _parse_date(fe)
|
||||
thumb = _extract_thumbnail(fe, content)
|
||||
|
||||
stmt = pg_insert(Entry).values(
|
||||
feed_id=feed.id,
|
||||
@@ -168,6 +194,7 @@ def fetch_single_feed(feed_id: int):
|
||||
published_at=pub_date,
|
||||
status="unread",
|
||||
starred=False,
|
||||
thumbnail=thumb,
|
||||
reading_time=_calc_reading_time(content),
|
||||
).on_conflict_do_nothing(
|
||||
constraint="uq_reader_entries_feed_url"
|
||||
|
||||
Reference in New Issue
Block a user