# OpenAPI 3.1 description of the Cargoffer OCR HTTP API.
openapi: "3.1.0"
info:
  title: "Cargoffer OCR API"
  version: "0.5.0"
  # Markdown overview rendered by documentation tools.
  # The Quick Start names the upload response field `job`, matching the
  # UploadResponse schema; the analyze path parameter is called `job_name`.
  description: |
    ## AI-First Document Intelligence API

    Extract structured data from logistics documents (PDF invoices, XLSX price lists,
    EML emails, images) using LLM vision models (Gemini, DeepSeek, GLM).

    ### Authentication
    All API endpoints (except /health and /api/signup) require a Bearer token.
    Get your API key via `POST /api/signup`.

    ### Quick Start
    1. `POST /api/signup` with your email → receives API key
    2. `POST /api/upload` with `Authorization: Bearer {key}` → receives `job`
    3. `POST /api/analyze/{job_name}` (use the `job` value as `job_name`) → receives structured JSON

    ### Deduplication
    Same document content (SHA-256 hash) = cached result across users.
    Soft delete preserves cache for other users.

  contact:
    name: "Cargoffer"
    url: "https://cargoffer.com"
    email: "cto@cargoffer.com"
  license:
    name: "Proprietary"

# Base URLs against which the paths in this document are resolved.
servers:
  - description: 'Production server'
    url: 'https://ocr.cargoffer.com'
  - description: 'Local development'
    url: 'http://localhost:9090'

paths:
  # Status probe; declares no `security` requirement.
  /health:
    get:
      operationId: healthCheck
      tags:
        - System
      summary: 'Health check'
      description: 'Returns API status and available models count.'
      responses:
        '200':
          description: 'API is healthy'
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthResponse'
              example:
                status: 'ok'
                models:
                  - 'gemini-3.5-flash'
                  - 'deepseek-v4-flash'
                  - 'glm-5v-turbo'
                jobs: 3

  # Account creation; the success response carries the API key.
  /api/signup:
    post:
      operationId: signup
      tags:
        - Authentication
        - Users
      summary: 'Create account'
      description: 'Create a new user account. Returns API key — show only once.'
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/SignupRequest'
            example:
              email: 'user@example.com'
              plan: 'free'
      responses:
        '200':
          description: 'Account created successfully'
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/SignupResponse'
        # Conflict: the e-mail address is already in use.
        '409':
          description: 'Email already registered'
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              example:
                detail: 'Email already registered'

  # Catalogue of extraction models.
  # NOTE(review): no `security` is declared here, although info.description
  # says every endpoint except /health and /api/signup needs a Bearer token —
  # confirm which of the two is correct.
  /api/models:
    get:
      operationId: listModels
      tags:
        - System
        - Models
      summary: 'List available models'
      description: 'Returns list of available LLM models for document extraction.'
      responses:
        '200':
          description: 'Model list'
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/ModelInfo'
              example:
                - id: 'gemini-3.5-flash'
                  name: 'Gemini 3.5 Flash'
                  supports_pdf: true
                - id: 'deepseek-v4-flash'
                  name: 'DeepSeek V4 Flash'
                  supports_pdf: false

  # Described as internal; the 200 response declares no body schema.
  /api/models/debug:
    get:
      operationId: listModelsDebug
      tags:
        - System
      summary: 'List models with debug info'
      description: 'Internal endpoint. Returns full model details including provider.'
      responses:
        '200':
          description: 'Debug model info'

  # Multipart upload of a single document; the returned `job` value is what
  # later calls use as the `job_name` path parameter.
  /api/upload:
    post:
      operationId: uploadDocument
      summary: "Upload document"
      description: |
        Upload a document for extraction. Returns a `job_name` used in subsequent
        `/api/analyze/{job_name}` calls.

        **Supported formats:**
        - PDF (invoices, CMRs, delivery notes)
        - XLSX (price lists, tariffs)
        - EML (emails with embedded documents)
        - PNG, JPG (scanned documents, photos)
      tags: [Documents, Extraction]
      security:
        - BearerAuth: []
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              # The request is meaningless without the file part, so it is
              # declared required rather than left optional.
              required:
                - file
              properties:
                file:
                  type: string
                  format: binary
                  description: "Document file (PDF, XLSX, EML, PNG, JPG)"
      # Response codes are listed in ascending order.
      responses:
        "200":
          description: "Document uploaded successfully"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/UploadResponse"
              example:
                job: "factura_repso_9b1632d0"
                filename: "factura_repso.pdf"
                size: 1458273
                status: "uploaded"
                doc_type: "invoice"
                recommended_model: "gemini-3.5-flash"
        "400":
          description: "Invalid file or extension"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Runs extraction on a previously uploaded document identified by `job_name`.
  /api/analyze/{job_name}:
    post:
      operationId: analyzeDocument
      summary: "Extract structured data"
      description: |
        Process an uploaded document and extract structured data using LLM vision models.

        The system auto-selects the best model:
        - **PDF**: Gemini 3.5 Flash (native PDF vision)
        - **XLSX/EML**: DeepSeek V4 Flash (fast on structured data)
        - **Images**: Gemini 3.5 Flash (best vision quality)

        Falls back to alternative model if primary fails.

        **Processing pipeline:**
        1. Document preparation (text extraction / image conversion)
        2. LLM extraction with document-type-specific prompt
        3. Schema validation via Pydantic models
        4. Verification pass (cross-check data against source)
      tags: [Documents, Extraction]
      security:
        - BearerAuth: []
      parameters:
        - name: job_name
          in: path
          required: true
          schema:
            type: string
            pattern: "^[a-zA-Z0-9_-]+$"
          description: "Job name returned from /api/upload"
        # Optional override; "auto" leaves model selection to the server.
        - name: model_id
          in: query
          required: false
          schema:
            type: string
            default: "auto"
            enum:
              - "auto"
              - "gemini-3.5-flash"
              - "gemini-3.1-pro"
              - "deepseek-v4-flash"
              - "deepseek-v4-pro"
              - "glm-5v-turbo"
          description: "Override auto-selected model. Default: auto"
      responses:
        "200":
          description: "Extraction completed"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/AnalyzeResponse"
              # `results` is keyed by the model id that produced the extraction.
              example:
                job: "factura_repso_9b1632d0"
                filename: "factura_repso.pdf"
                results:
                  gemini-3.5-flash:
                    document_type: "invoice"
                    provider: "Solred SA"
                    provider_nif: "A79707345"
                    invoice_number: "SD260032184"
                    period_from: "01/03/2026"
                    period_to: "31/03/2026"
                    issue_date: "31/03/2026"
                    due_date: "30/04/2026"
                    customer:
                      name: "TRANSPORTES ANDALUCES SL"
                      nif: "ESB41076589"
                    totals:
                      total_base: 223389.71
                      total_iva: 39863.73
                      total: 263384.78
                      total_with_other: 264791.96
                    line_items:
                      - concept: "DIESEL E+ NEOTECH (L)"
                        quantity: 112407.45
                        base: 168334.17
                        iva_pct: 21
                        iva: 35350.18
                        total: 203684.35
                    card_count: 65
                    fuel_total_liters: 160217.79
                    fuel_total_amount: 292960.76
                    _stages:
                      - stage: "preparando"
                        detail: "Documento: original.pdf"
                    _verified:
                      all_confirmed: true
        "401":
          $ref: "#/components/responses/Unauthorized"
        "402":
          description: "Quota exhausted"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        # Mirrors the 404 documented on GET /api/jobs/{job_name} for the same
        # identifier, so clients can handle an unknown job here as well.
        "404":
          description: "Job not found"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"

  # Lists the authenticated user's documents.
  /api/jobs:
    get:
      operationId: listJobs
      summary: "List user's documents"
      description: "Returns list of processed documents for the authenticated user."
      tags: [Documents]
      security:
        - BearerAuth: []
      responses:
        "200":
          description: "Document list"
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: "#/components/schemas/JobSummary"
              example:
                - job: "factura_repso_9b1632d0"
                  original: "original.pdf"
                  has_result: true
                  filename: "original.pdf"
                  verified: true
                - job: "precios_8da465b9"
                  original: "original.xlsx"
                  has_result: true
                  filename: "original.xlsx"
                  verified: true
        # The operation requires BearerAuth, so the shared 401 response is
        # documented as it is on the upload and analyze operations.
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Read or soft-delete a single document of the authenticated user.
  /api/jobs/{job_name}:
    get:
      operationId: getJobDetail
      summary: "Get extraction result"
      description: "Returns the extraction result for a specific document."
      tags: [Documents]
      security:
        - BearerAuth: []
      parameters:
        - name: job_name
          in: path
          required: true
          schema:
            type: string
          description: "Job name"
      responses:
        "200":
          description: "Job detail"
          content:
            application/json:
              schema:
                type: object
                properties:
                  job:
                    type: string
                  filename:
                    type: string
                  results:
                    type: object
                    description: "Model ID → extraction result"
        # Documented because the operation requires BearerAuth.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          description: "Job not found"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"

    delete:
      operationId: deleteJob
      summary: "Delete document from history"
      description: |
        Soft-deletes document from user history. The cached extraction result
        is preserved for other users who uploaded the same document.
      tags: [Documents]
      security:
        - BearerAuth: []
      parameters:
        - name: job_name
          in: path
          required: true
          schema:
            type: string
          description: "Job name"
      responses:
        "200":
          description: "Deleted"
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
        # Documented because the operation requires BearerAuth.
        "401":
          $ref: "#/components/responses/Unauthorized"
        # Same not-found contract as the GET operation on this path.
        "404":
          description: "Job not found"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"

  # Processing timeline of a single job.
  /api/jobs/{job_name}/stages:
    get:
      operationId: getJobStages
      summary: "Get processing timeline"
      description: "Returns the processing pipeline stages for a job."
      tags: [Documents]
      # info.description states that every endpoint except /health and
      # /api/signup needs a Bearer token, and the other /api/jobs operations
      # declare it; this operation previously declared no security at all.
      security:
        - BearerAuth: []
      parameters:
        - name: job_name
          in: path
          required: true
          schema:
            type: string
          description: "Job name"
      responses:
        "200":
          description: "Processing stages"
          content:
            application/json:
              schema:
                type: object
                properties:
                  stages:
                    type: array
                    items:
                      $ref: "#/components/schemas/Stage"
                  verified:
                    type: object
              example:
                stages:
                  - stage: "preparando"
                    detail: "Documento: original.pdf"
                  - stage: "procesando"
                    detail: "Analizando documento PDF"
                  - stage: "estructurando"
                    detail: "Organizando datos extraídos"
                  - stage: "verificando"
                    detail: "Comprobando datos contra documento original"
                  - stage: "completado"
                    detail: "✅ Datos verificados"
                verified:
                  all_confirmed: true
        "401":
          $ref: "#/components/responses/Unauthorized"
        # Same not-found contract as GET /api/jobs/{job_name}.
        "404":
          description: "Job not found"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"

  # Profile and quota of the caller identified by the Bearer token.
  /api/me:
    get:
      operationId: getCurrentUser
      summary: "Current user info"
      description: "Returns authenticated user's profile and quota information."
      tags: [Users, Authentication]
      security:
        - BearerAuth: []
      responses:
        "200":
          description: "User profile"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/UserProfile"
        # Documented because the operation requires BearerAuth.
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Plan upgrade request; the description marks the Stripe integration as pending.
  /api/billing/upgrade:
    post:
      operationId: upgradePlan
      summary: "Upgrade plan"
      description: "Request a plan upgrade (Stripe integration pending)."
      tags: [Users, Billing]
      security:
        - BearerAuth: []
      # No `required` flag is set, so the body is optional per OpenAPI defaults.
      requestBody:
        content:
          application/json:
            schema:
              type: object
              properties:
                plan:
                  type: string
                  enum: ["free", "pro", "enterprise"]
                  default: "pro"
      responses:
        "200":
          description: "Upgrade info"
          content:
            application/json:
              schema:
                type: object
                properties:
                  user_id:
                    type: integer
                  current_plan:
                    type: string
                  requested_plan:
                    type: string
                  monthly_price_eur:
                    type: number
                  pages_limit:
                    type: integer
        # Documented because the operation requires BearerAuth.
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Marketing landing page served as HTML.
  /:
    get:
      operationId: landingPage
      summary: "Landing page (HTML)"
      description: "Returns the marketing landing page."
      tags: [Web]
      responses:
        "200":
          description: "Landing page HTML"
          # Declare the media type so clients and generators know the body is
          # an HTML document rather than an empty response.
          content:
            text/html:
              schema:
                type: string

  # Extraction dashboard UI served as HTML.
  /app:
    get:
      operationId: appUI
      summary: "Application UI (HTML)"
      description: "Returns the extraction dashboard UI."
      tags: [Web]
      responses:
        "200":
          description: "App UI HTML"
          # Declare the media type so clients and generators know the body is
          # an HTML document rather than an empty response.
          content:
            text/html:
              schema:
                type: string

components:
  securitySchemes:
    # HTTP bearer scheme that operations reference as `BearerAuth`.
    # NOTE(review): bearerFormat advertises JWT while the description also
    # allows `ocr_` API keys — confirm the hint is intended.
    BearerAuth:
      scheme: bearer
      type: http
      bearerFormat: 'JWT'
      description: 'Clerk session token or API key. Format: `Bearer ocr_xxx` or `Bearer jwt_token`'

  schemas:
    # Body of GET /health; all three fields are mandatory.
    HealthResponse:
      type: object
      required:
        - status
        - models
        - jobs
      properties:
        status:
          type: string
          enum:
            - ok
            - error
        models:
          description: 'Available model IDs'
          type: array
          items:
            type: string
        jobs:
          description: 'Number of existing jobs'
          type: integer

    # Body of POST /api/signup; only `email` is mandatory.
    SignupRequest:
      type: object
      required:
        - email
      properties:
        email:
          description: 'User email address'
          type: string
          format: email
        plan:
          description: 'Subscription plan'
          type: string
          enum:
            - free
            - pro
            - enterprise
          default: free

    # Success body of POST /api/signup; no field is declared required.
    SignupResponse:
      type: object
      properties:
        user_id: {type: integer}
        email: {type: string}
        plan: {type: string}
        pages_limit: {type: integer}
        api_key:
          description: 'API key — show only once!'
          type: string
        warning: {type: string}

    # One entry of the GET /api/models array.
    ModelInfo:
      type: object
      properties:
        id:
          description: 'Model identifier'
          type: string
        name:
          description: 'Human-readable model name'
          type: string
        supports_pdf:
          description: 'Whether model supports native PDF vision'
          type: boolean

    # Success body of POST /api/upload.
    UploadResponse:
      type: object
      properties:
        job:
          type: string
          description: "Job name for /api/analyze/{job_name}"
        filename:
          type: string
        size:
          type: integer
          description: "File size in bytes"
        status:
          type: string
          enum: [uploaded]
        doc_type:
          type: string
          enum: [invoice, price_list, email, generic]
        recommended_model:
          # OpenAPI 3.1 dropped the 3.0 `nullable` keyword; nullability is
          # expressed with a JSON Schema type union that includes "null".
          type:
            - string
            - "null"

    # Success body of POST /api/analyze/{job_name}.
    AnalyzeResponse:
      type: object
      properties:
        job: {type: string}
        filename: {type: string}
        # Free-form object; the inner structure is not constrained here.
        results:
          description: 'Map of model_id → extraction result'
          type: object

    # One entry of the GET /api/jobs array.
    JobSummary:
      type: object
      properties:
        job:
          type: string
        original:
          type: string
        has_result:
          type: boolean
        filename:
          type: string
        verified:
          # OpenAPI 3.1 dropped the 3.0 `nullable` keyword; nullability is
          # expressed with a JSON Schema type union that includes "null".
          type:
            - boolean
            - "null"

    # Body of GET /api/me: identity plus page-quota counters.
    UserProfile:
      type: object
      properties:
        user_id: {type: integer}
        email: {type: string}
        plan: {type: string}
        pages_used: {type: integer}
        pages_limit: {type: integer}
        pages_remaining: {type: integer}

    # One step of a job's processing timeline.
    Stage:
      type: object
      properties:
        # Stage identifiers are the literal values emitted by the API.
        stage:
          type: string
          enum: [preparando, procesando, estructurando, verificando, completado, error, reintentando]
        detail:
          description: 'Human-readable stage description'
          type: string
        time:
          description: 'Unix timestamp'
          type: number

    # Shared error body; both fields are optional strings.
    ErrorResponse:
      type: object
      properties:
        detail: {type: string}
        error: {type: string}

  responses:
    # Reusable 401 response referenced by authenticated operations.
    Unauthorized:
      description: 'Missing or invalid authentication'
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            detail: 'Invalid API key'

# Tag catalogue used to group operations in generated documentation.
tags:
  - name: Documents
    description: 'Document upload, analysis, and management'
  - name: Extraction
    description: 'Document intelligence and data extraction'
  - name: Users
    description: 'User accounts and billing'
  - name: Authentication
    description: 'Authentication and API keys'
  - name: System
    description: 'System health and metadata'
  - name: Models
    description: 'LLM model information'
  - name: Billing
    description: 'Subscription and billing'
  - name: Web
    description: 'Web interface endpoints'
