feat: brain PDF/image text extraction — pymupdf + tesseract OCR + vision API
- PDF: extracts selectable text via pymupdf, falls back to Tesseract OCR for scanned docs
- PDF: renders first page as screenshot thumbnail
- Images: Tesseract OCR for text extraction, OpenAI vision API fallback for photos
- Plain text files: direct decode
- All extracted text stored in extracted_text field for search/embedding
- Tested: PDF upload → text extracted → AI classified → searchable

New deps: pymupdf, pytesseract, Pillow
System dep: tesseract-ocr added to both Dockerfiles

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -99,6 +99,66 @@ async def _process_item(item_id: str):
    )
    db.add(asset)

    # ── Step 1b: Process uploaded files (PDF, image, document) ──
    if item.type in ("pdf", "image", "document", "file"):
        from app.services.extract import extract_text_from_file, describe_image_with_vision
        from app.services.storage import storage as file_storage

        # Find the original upload asset
        upload_asset = None
        for a in item.assets:
            if a.asset_type == "original_upload":
                upload_asset = a
                break

        if upload_asset and file_storage.exists(upload_asset.storage_path):
            log.info(f"Extracting text from {item.type}: {upload_asset.filename}")
            file_bytes = file_storage.read(upload_asset.storage_path)
            result = extract_text_from_file(
                file_bytes,
                upload_asset.content_type or "application/octet-stream",
                upload_asset.filename,
            )

            if result["text"]:
                extracted_text = result["text"]
                log.info(f"Extracted {len(extracted_text)} chars via {result['method']}")

            # Save PDF screenshot as an asset
            if result.get("screenshot_png"):
                from app.services.storage import storage
                path = storage.save(
                    item_id=item.id,
                    asset_type="screenshot",
                    filename="screenshot.png",
                    data=result["screenshot_png"],
                )
                asset = ItemAsset(
                    id=str(uuid.uuid4()),
                    item_id=item.id,
                    asset_type="screenshot",
                    filename="screenshot.png",
                    content_type="image/png",
                    storage_path=path,
                )
                db.add(asset)

            # For images with little OCR text, try vision API for description
            if item.type == "image" and len(extracted_text) < 50:
                log.info("Image has little OCR text, trying vision API...")
                vision_text = await describe_image_with_vision(
                    file_bytes,
                    upload_asset.content_type or "image/png",
                )
                if vision_text:
                    extracted_text = vision_text
                    log.info(f"Vision API returned {len(vision_text)} chars")

            item.metadata_json = item.metadata_json or {}
            item.metadata_json["extraction_method"] = result["method"]
            if result.get("page_count"):
                item.metadata_json["page_count"] = result["page_count"]

    # ── Step 2: AI classification ──
    log.info(f"Classifying item {item.id}")
    classification = await classify_item(
Reference in New Issue
Block a user