diff --git a/services/brain/Dockerfile.api b/services/brain/Dockerfile.api
index e4446ce..685d2c8 100644
--- a/services/brain/Dockerfile.api
+++ b/services/brain/Dockerfile.api
@@ -2,7 +2,7 @@ FROM python:3.12-slim
 
 WORKDIR /app
 
-RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev tesseract-ocr tesseract-ocr-eng && rm -rf /var/lib/apt/lists/*
 
 RUN pip install --no-cache-dir --upgrade pip
 COPY requirements.txt .
diff --git a/services/brain/Dockerfile.worker b/services/brain/Dockerfile.worker
index e0ebe0b..21be682 100644
--- a/services/brain/Dockerfile.worker
+++ b/services/brain/Dockerfile.worker
@@ -2,7 +2,7 @@ FROM python:3.12-slim
 
 WORKDIR /app
 
-RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev tesseract-ocr tesseract-ocr-eng && rm -rf /var/lib/apt/lists/*
 
 RUN pip install --no-cache-dir --upgrade pip
 COPY requirements.txt .
diff --git a/services/brain/app/services/extract.py b/services/brain/app/services/extract.py
new file mode 100644
index 0000000..2a97eca
--- /dev/null
+++ b/services/brain/app/services/extract.py
@@ -0,0 +1,202 @@
+"""Document and image text extraction — PDF parsing, OCR, vision API."""
+
+import io
+import logging
+from pathlib import Path
+
+import httpx
+
+from app.config import OPENAI_API_KEY, OPENAI_MODEL
+
+log = logging.getLogger(__name__)
+
+
+def extract_pdf_text(pdf_bytes: bytes) -> dict:
+    """Extract text from a PDF. Returns {text, page_count, needs_ocr, pages_text}."""
+    import fitz  # pymupdf
+
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    pages_text = []
+    full_text = []
+
+    for page in doc:
+        text = page.get_text("text").strip()
+        pages_text.append(text)
+        full_text.append(text)
+
+    doc.close()
+    combined = "\n\n".join(full_text).strip()
+
+    # If very little text extracted, it's probably a scanned PDF
+    needs_ocr = len(combined) < 50 and len(pages_text) > 0
+
+    return {
+        "text": combined,
+        "page_count": len(pages_text),
+        "needs_ocr": needs_ocr,
+        "pages_text": pages_text,
+    }
+
+
+def render_pdf_first_page(pdf_bytes: bytes) -> bytes | None:
+    """Render the first page of a PDF as a PNG image. Returns PNG bytes."""
+    import fitz
+
+    try:
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        if len(doc) == 0:
+            return None
+        page = doc[0]
+        # Render at 2x resolution for quality
+        mat = fitz.Matrix(2, 2)
+        pix = page.get_pixmap(matrix=mat)
+        png_bytes = pix.tobytes("png")
+        doc.close()
+        return png_bytes
+    except Exception as e:
+        log.error(f"Failed to render PDF page: {e}")
+        return None
+
+
+def render_pdf_pages(pdf_bytes: bytes, max_pages: int = 5) -> list[bytes]:
+    """Render multiple PDF pages as PNG images."""
+    import fitz
+
+    images = []
+    try:
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        for i, page in enumerate(doc):
+            if i >= max_pages:
+                break
+            mat = fitz.Matrix(2, 2)
+            pix = page.get_pixmap(matrix=mat)
+            images.append(pix.tobytes("png"))
+        doc.close()
+    except Exception as e:
+        log.error(f"Failed to render PDF pages: {e}")
+    return images
+
+
+def ocr_image(image_bytes: bytes) -> str:
+    """Run Tesseract OCR on an image. Returns extracted text."""
+    import pytesseract
+    from PIL import Image
+
+    try:
+        img = Image.open(io.BytesIO(image_bytes))
+        text = pytesseract.image_to_string(img)
+        return text.strip()
+    except Exception as e:
+        log.error(f"OCR failed: {e}")
+        return ""
+
+
+def ocr_pdf(pdf_bytes: bytes) -> str:
+    """OCR a scanned PDF by rendering pages to images then running Tesseract."""
+    pages = render_pdf_pages(pdf_bytes, max_pages=10)
+    all_text = []
+    for i, page_img in enumerate(pages):
+        log.info(f"OCR page {i + 1}/{len(pages)}")
+        text = ocr_image(page_img)
+        if text:
+            all_text.append(text)
+    return "\n\n".join(all_text)
+
+
+async def describe_image_with_vision(image_bytes: bytes, content_type: str = "image/png") -> str:
+    """Use OpenAI Vision API to describe an image. Returns description text."""
+    if not OPENAI_API_KEY:
+        log.warning("No OPENAI_API_KEY, skipping vision description")
+        return ""
+
+    import base64
+    b64 = base64.b64encode(image_bytes).decode("utf-8")
+
+    try:
+        async with httpx.AsyncClient(timeout=30) as client:
+            resp = await client.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
+                json={
+                    "model": OPENAI_MODEL,
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "Describe this image in detail. Extract ALL text visible in the image. If it's a document, receipt, form, or screenshot, transcribe the text content. If it's a photo, describe what you see."
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {
+                                        "url": f"data:{content_type};base64,{b64}",
+                                        "detail": "high"
+                                    }
+                                }
+                            ]
+                        }
+                    ],
+                    "max_tokens": 2000,
+                },
+            )
+            resp.raise_for_status()
+            data = resp.json()
+            return data["choices"][0]["message"]["content"].strip()
+    except Exception as e:
+        log.error(f"Vision API failed: {e}")
+        return ""
+
+
+def extract_text_from_file(file_bytes: bytes, content_type: str, filename: str) -> dict:
+    """Extract text from any supported file type.
+
+    Returns {text, method, page_count, screenshot_png}.
+    """
+    ct = content_type.lower()
+    fname = filename.lower()
+
+    # PDF
+    if ct == "application/pdf" or fname.endswith(".pdf"):
+        result = extract_pdf_text(file_bytes)
+        text = result["text"]
+        method = "pdf_text"
+
+        # If scanned (no text), try OCR
+        if result["needs_ocr"]:
+            log.info("PDF has no selectable text, running OCR...")
+            text = ocr_pdf(file_bytes)
+            method = "pdf_ocr"
+
+        screenshot = render_pdf_first_page(file_bytes)
+
+        return {
+            "text": text,
+            "method": method,
+            "page_count": result["page_count"],
+            "screenshot_png": screenshot,
+        }
+
+    # Images
+    if ct.startswith("image/"):
+        # Try OCR first
+        ocr_text = ocr_image(file_bytes)
+        method = "image_ocr"
+
+        return {
+            "text": ocr_text,
+            "method": method,
+            "page_count": None,
+            "screenshot_png": None,  # The image itself is the "screenshot"
+        }
+
+    # Plain text files
+    if ct.startswith("text/") or fname.endswith((".txt", ".md", ".csv")):
+        try:
+            text = file_bytes.decode("utf-8")
+            return {"text": text, "method": "text_decode", "page_count": None, "screenshot_png": None}
+        except UnicodeDecodeError:
+            return {"text": "", "method": "decode_failed", "page_count": None, "screenshot_png": None}
+
+    # Unsupported
+    return {"text": "", "method": "unsupported", "page_count": None, "screenshot_png": None}
diff --git a/services/brain/app/worker/tasks.py b/services/brain/app/worker/tasks.py
index 36585ec..750fc82 100644
--- a/services/brain/app/worker/tasks.py
+++ b/services/brain/app/worker/tasks.py
@@ -99,6 +99,66 @@ async def _process_item(item_id: str):
         )
         db.add(asset)
 
+    # ── Step 1b: Process uploaded files (PDF, image, document) ──
+    if item.type in ("pdf", "image", "document", "file"):
+        from app.services.extract import extract_text_from_file, describe_image_with_vision
+        from app.services.storage import storage as file_storage
+
+        # Find the original upload asset
+        upload_asset = None
+        for a in item.assets:
+            if a.asset_type == "original_upload":
+                upload_asset = a
+                break
+
+        if upload_asset and file_storage.exists(upload_asset.storage_path):
+            log.info(f"Extracting text from {item.type}: {upload_asset.filename}")
+            file_bytes = file_storage.read(upload_asset.storage_path)
+            result = extract_text_from_file(
+                file_bytes,
+                upload_asset.content_type or "application/octet-stream",
+                upload_asset.filename,
+            )
+
+            if result["text"]:
+                extracted_text = result["text"]
+                log.info(f"Extracted {len(extracted_text)} chars via {result['method']}")
+
+            # Save PDF screenshot as an asset
+            if result.get("screenshot_png"):
+                from app.services.storage import storage
+                path = storage.save(
+                    item_id=item.id,
+                    asset_type="screenshot",
+                    filename="screenshot.png",
+                    data=result["screenshot_png"],
+                )
+                asset = ItemAsset(
+                    id=str(uuid.uuid4()),
+                    item_id=item.id,
+                    asset_type="screenshot",
+                    filename="screenshot.png",
+                    content_type="image/png",
+                    storage_path=path,
+                )
+                db.add(asset)
+
+            # For images with little OCR text, try vision API for description
+            if item.type == "image" and len(extracted_text) < 50:
+                log.info("Image has little OCR text, trying vision API...")
+                vision_text = await describe_image_with_vision(
+                    file_bytes,
+                    upload_asset.content_type or "image/png",
+                )
+                if vision_text:
+                    extracted_text = vision_text
+                    log.info(f"Vision API returned {len(vision_text)} chars")
+
+            item.metadata_json = item.metadata_json or {}
+            item.metadata_json["extraction_method"] = result["method"]
+            if result.get("page_count"):
+                item.metadata_json["page_count"] = result["page_count"]
+
     # ── Step 2: AI classification ──
     log.info(f"Classifying item {item.id}")
     classification = await classify_item(
diff --git a/services/brain/requirements.txt b/services/brain/requirements.txt
index fbf499b..fcdd129 100644
--- a/services/brain/requirements.txt
+++ b/services/brain/requirements.txt
@@ -9,3 +9,6 @@ rq==2.1.0
 httpx==0.28.1
 pydantic==2.10.4
 python-multipart==0.0.20
+pymupdf==1.25.3
+pytesseract==0.3.13
+Pillow==11.1.0
diff --git a/services/brain/storage/a66d0767-a59e-4ede-878d-f74441e42830/original_upload/test-insurance.pdf b/services/brain/storage/a66d0767-a59e-4ede-878d-f74441e42830/original_upload/test-insurance.pdf
new file mode 100644
index 0000000..745458d
--- /dev/null
+++ b/services/brain/storage/a66d0767-a59e-4ede-878d-f74441e42830/original_upload/test-insurance.pdf
@@ -0,0 +1,19 @@
+%PDF-1.0
+1 0 obj<>endobj
+2 0 obj<>endobj
+3 0 obj<>>>/Contents 5 0 R>>endobj
+4 0 obj<>endobj
+5 0 obj<>
+stream
+BT /F1 18 Tf 72 700 Td (State Farm Insurance Policy) Tj ET
+BT /F1 12 Tf 72 670 Td (Policy Number: SF-2024-881234) Tj ET
+BT /F1 12 Tf 72 650 Td (Policyholder: Yusuf Suleman) Tj ET
+BT /F1 12 Tf 72 630 Td (Deductible: 500 dollars) Tj ET
+endstream
+endobj
+xref
+0 6
+trailer<>
+startxref
+0
+%%EOF
\ No newline at end of file