# syntax=docker/dockerfile:1.7
# Base image: Debian-based slim variant of the official CPython 3.12 image.
# NOTE(review): the tag is pinned to a minor version but not to a digest, so
# rebuilds may pick up a newer patch release — pin by digest if fully
# reproducible builds are required.
FROM python:3.12-slim

# Runtime / build environment (comment lines inside a continued instruction
# are stripped by the Dockerfile parser):
# HF_HOME                        — Hugging Face cache root
# PIP_DISABLE_PIP_VERSION_CHECK  — skip pip's "new version available" check
# PIP_NO_CACHE_DIR               — keep pip's download/wheel cache out of layers
# PYTHONUNBUFFERED               — unbuffered stdout/stderr so logs appear promptly
ENV HF_HOME=/root/.cache/huggingface \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONUNBUFFERED=1

# OS-level packages needed by the ingest pipeline:
#   build-essential — compiler toolchain (presumably for pip packages that
#                     lack prebuilt wheels — TODO confirm it is still needed)
#   ca-certificates — TLS trust store for HTTPS fetches
#   curl            — HTTP fetch
#   ffmpeg          — Whisper audio decode
#   git             — pip installs from git refs (if any)
#   nodejs, npm     — defuddle CLI runs via `npx --yes defuddle`
#
# Index refresh, install and list cleanup stay in a single layer so the apt
# package lists never persist in the image. `set -e` makes any failing step
# abort the build, since heredoc lines are not implicitly `&&`-chained.
RUN <<EOF
set -e
apt-get update
apt-get install --yes --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    ffmpeg \
    git \
    nodejs \
    npm
rm -rf /var/lib/apt/lists/*
EOF

# Install the defuddle CLI globally at build time so the first ingest run does
# not have to pay for an npm install. The primary package name is tried first;
# if that install fails for any reason, the alternate package name is tried.
# NOTE(review): the fallback also fires on transient errors (e.g. network), and
# the npm cache is left in this layer — confirm both are intentional.
RUN npm install --global defuddle-cli \
    || npm install --global defuddle

# All project sources live under /app; later relative paths resolve against it.
WORKDIR /app

# Install the CPU-only PyTorch build from the PyTorch wheel index (same
# constraint as the indexer; whisper depends on torch). Done before any source
# is copied so this large layer stays cached across code changes.
RUN pip install "torch>=2.3" --index-url https://download.pytorch.org/whl/cpu

# kv_ingest depends on kv_indexer, so the indexer package is copied and
# installed (editable) first. Destinations are spelled out as absolute paths;
# they are the same locations as ./indexer/ under the /app working directory.
COPY indexer/pyproject.toml indexer/README.md /app/indexer/
COPY indexer/src/ /app/indexer/src/
RUN pip install --editable /app/indexer

# The ingest package itself, installed editable with its "pdf" extra.
# Destinations are absolute; they match ./ingest/ under the /app working
# directory.
COPY ingest/pyproject.toml ingest/README.md /app/ingest/
COPY ingest/src/ /app/ingest/src/
RUN pip install --editable "/app/ingest[pdf]"

# Run from the ingest package directory so relative paths used at runtime
# resolve against /app/ingest.
WORKDIR /app/ingest

# Default process, in exec (JSON-array) form so it runs as PID 1 and receives
# signals directly. `kv-ingest-cron` is presumably a console script installed
# by the ingest package — verify against ingest/pyproject.toml.
# NOTE(review): no USER directive is set, so the container runs as root;
# HF_HOME points under /root, so switching to a non-root user would also
# require relocating that cache directory.
CMD ["kv-ingest-cron"]
