From 0db2482bf26f6304c98cf9bd4d85a3d1ae844173 Mon Sep 17 00:00:00 2001 From: m3tm3re Date: Wed, 4 Feb 2026 19:19:44 +0100 Subject: [PATCH] feat(project): initialize ZUGFeRD service with pyproject.toml and directory structure --- .gitignore | 42 + .sisyphus/boulder.json | 8 + .sisyphus/drafts/zugferd-service.md | 130 ++ .../notepads/zugferd-service/decisions.md | 29 + .sisyphus/notepads/zugferd-service/issues.md | 7 + .../notepads/zugferd-service/learnings.md | 44 + .../notepads/zugferd-service/problems.md | 7 + .sisyphus/plans/zugferd-service.md | 1764 +++++++++++++++++ pyproject.toml | 35 + src/__init__.py | 3 + src/extractor.py | 3 + src/main.py | 20 + src/models.py | 3 + src/pdf_parser.py | 3 + src/utils.py | 3 + src/validator.py | 3 + tests/__init__.py | 1 + tests/conftest.py | 9 + tests/fixtures/.gitkeep | 0 19 files changed, 2114 insertions(+) create mode 100644 .gitignore create mode 100644 .sisyphus/boulder.json create mode 100644 .sisyphus/drafts/zugferd-service.md create mode 100644 .sisyphus/notepads/zugferd-service/decisions.md create mode 100644 .sisyphus/notepads/zugferd-service/issues.md create mode 100644 .sisyphus/notepads/zugferd-service/learnings.md create mode 100644 .sisyphus/notepads/zugferd-service/problems.md create mode 100644 .sisyphus/plans/zugferd-service.md create mode 100644 pyproject.toml create mode 100644 src/__init__.py create mode 100644 src/extractor.py create mode 100644 src/main.py create mode 100644 src/models.py create mode 100644 src/pdf_parser.py create mode 100644 src/utils.py create mode 100644 src/validator.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/.gitkeep diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e1559e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,42 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ 
+*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# Sisyphus internal +.sisyphus/boulder.json +.sisyphus/notepads/*/issues.md +.sisyphus/notepads/*/problems.md diff --git a/.sisyphus/boulder.json b/.sisyphus/boulder.json new file mode 100644 index 0000000..c91693f --- /dev/null +++ b/.sisyphus/boulder.json @@ -0,0 +1,8 @@ +{ + "active_plan": "/home/m3tam3re/p/AZ/zugferd-service/.sisyphus/plans/zugferd-service.md", + "started_at": "2026-02-04T18:12:44.865Z", + "session_ids": [ + "ses_3d634d45bffeW59tJMBkcCfwyd" + ], + "plan_name": "zugferd-service" +} \ No newline at end of file diff --git a/.sisyphus/drafts/zugferd-service.md b/.sisyphus/drafts/zugferd-service.md new file mode 100644 index 0000000..7efaa66 --- /dev/null +++ b/.sisyphus/drafts/zugferd-service.md @@ -0,0 +1,130 @@ +# Draft: ZUGFeRD-Service Implementation + +## Requirements (confirmed) + +### Core Functionality +- **Purpose**: Python REST API for ZUGFeRD/Factur-X invoice extraction and validation +- **Framework**: FastAPI (preferred by user) +- **Runtime**: Python 3.11+ +- **Deployment**: Docker container on NixOS server + native Nix package + +### API Endpoints +1. `GET /health` - Health check endpoint +2. `POST /extract` - PDF extraction (accepts base64-encoded PDF) +3. 
`POST /validate` - Invoice validation (pflichtfelder, betraege, ustid, pdf_abgleich) + +### Key Dependencies +- `factur-x>=2.5` - ZUGFeRD/Factur-X extraction +- `pypdf>=4.0.0` - PDF text extraction +- `fastapi>=0.109.0` - API framework +- `uvicorn>=0.27.0` - ASGI server +- `pydantic>=2.5.0` - Data models +- `lxml>=5.0.0` - XML parsing +- `python-multipart>=0.0.6` - File uploads + +### Project Structure (user-specified) +``` +zugferd-service/ +├── Dockerfile +├── requirements.txt +├── README.md +├── src/ +│ ├── __init__.py +│ ├── main.py # FastAPI App + Endpoints +│ ├── extractor.py # ZUGFeRD/PDF Extraktion +│ ├── validator.py # Validierungslogik +│ ├── pdf_parser.py # PDF-Text-Parsing für Abgleich +│ ├── models.py # Pydantic Models +│ └── utils.py # Hilfsfunktionen +├── tests/ +│ ├── __init__.py +│ ├── test_extractor.py +│ ├── test_validator.py +│ └── fixtures/ +│ ├── sample_zugferd.pdf +│ └── sample_no_zugferd.pdf +└── docker-compose.yml +``` + +## Research Findings + +### Nix Packaging (from librarian research) +- Use `buildPythonApplication` for standalone service +- `pyproject = true` with hatchling/setuptools +- `pythonRelaxDeps = true` for dependency flexibility +- mem0 example pattern: custom server script via `postInstall` +- Consider flake.nix for modern Nix workflow + +### factur-x Library (from librarian research) +- `get_xml_from_pdf()` - Core extraction function +- `get_level()` / `get_flavor()` - Profile detection +- Namespaces: rsm, ram, udt for UN/CEFACT CII format +- Profile levels: minimum, basicwl, basic, en16931, extended + +### UN/ECE Unit Codes +- C62 = Piece, KGM = Kilogram, H87 = Piece (alt) +- Need comprehensive mapping dictionary + +## Technical Decisions + +### Python Tooling +- **PENDING**: Use pyproject.toml (modern) or requirements.txt (legacy)? +- **PENDING**: Build system: setuptools, hatchling, or poetry-core? + +### Nix Approach +- **PENDING**: Flake-based or traditional Nix expressions? 
+- **PENDING**: Include NixOS service module? + +### Testing Strategy +- **PENDING**: TDD or tests-after? +- **PENDING**: Test framework: pytest (standard choice) + +## Scope Boundaries + +### INCLUDE +- All 3 API endpoints as specified +- All validation checks (pflichtfelder, betraege, ustid, pdf_abgleich) +- Docker multi-stage build +- Nix packaging +- Basic test suite +- README documentation + +### EXCLUDE +- Online USt-ID validation (only format check) +- Database/persistence (stateless service) +- Authentication/authorization +- Rate limiting +- Metrics/tracing + +## Open Questions (RESOLVED) + +1. ✅ **Python project structure**: pyproject.toml with hatchling +2. ✅ **Build system**: hatchling (modern, Nix-friendly) +3. ✅ **Nix approach**: Flake-based +4. ✅ **Testing**: TDD (test-first) with pytest +5. ✅ **Sample PDFs**: Source from official ZUGFeRD repositories + +## Metis Gap Analysis (Reviewed) + +### Gaps Classified as MINOR (Auto-Resolved) +- **UN/ECE unit codes**: Start with common codes (C62, KGM, H87, MTR, LTR, etc.), expand as needed +- **Tolerance**: Hardcode 0.01 EUR as specified +- **Validation scope**: Check required fields exist for declared profile +- **Error codes**: Implement as specified in user's detailed spec + +### Gaps Classified as DEFAULTS APPLIED +- **Authentication**: OPEN (no auth mentioned in spec → stateless public API) +- **ZUGFeRD profiles**: ALL profiles supported (MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED) +- **Deployment**: Container-based on NixOS (as per NixOS config section in spec) +- **PDF text extraction**: REQUIRED for pdf_abgleich check (explicitly in spec) +- **File size limit**: Handle as error for >10MB (spec mentions this edge case) + +### Guardrails (Must NOT Have) +- NO authentication middleware +- NO database/persistence +- NO caching layers +- NO rate limiting +- NO metrics endpoints (beyond /health) +- NO CLI interface +- NO web UI +- NO abstraction layers for "future extensibility" diff --git 
a/.sisyphus/notepads/zugferd-service/decisions.md b/.sisyphus/notepads/zugferd-service/decisions.md new file mode 100644 index 0000000..037a730 --- /dev/null +++ b/.sisyphus/notepads/zugferd-service/decisions.md @@ -0,0 +1,29 @@ +# Decisions - zugferd-service + +This file accumulates architectural and design decisions during execution. + +## [2026-02-04T18:12:44.864Z] Session Start +Planning decisions from Prometheus session. + +## Tech Stack +- Python 3.11+ +- FastAPI for REST API +- Pydantic for data validation +- pytest for testing +- factur-x library for ZUGFeRD extraction +- pypdf for PDF text extraction + +## Scope Lock +- NO authentication (stateless service) +- NO database +- NO caching +- NO rate limiting +- 10MB file size limit +- 0.01 EUR tolerance for calculations + +## Validation Rules +- pflichtfelder: Required fields check +- betraege: Amount calculations check +- ustid: VAT ID format check (no online validation) +- pdf_abgleich: XML vs PDF text comparison + diff --git a/.sisyphus/notepads/zugferd-service/issues.md b/.sisyphus/notepads/zugferd-service/issues.md new file mode 100644 index 0000000..a95d7dc --- /dev/null +++ b/.sisyphus/notepads/zugferd-service/issues.md @@ -0,0 +1,7 @@ +# Issues - zugferd-service + +This file accumulates problems, errors, and gotchas during execution. + +## [2026-02-04T18:12:44.864Z] Session Start +Initial session started. No issues yet. + diff --git a/.sisyphus/notepads/zugferd-service/learnings.md b/.sisyphus/notepads/zugferd-service/learnings.md new file mode 100644 index 0000000..24331e0 --- /dev/null +++ b/.sisyphus/notepads/zugferd-service/learnings.md @@ -0,0 +1,44 @@ +# Learnings - zugferd-service + +This file accumulates conventions, patterns, and learnings during execution. + +## [2026-02-04T18:12:44.864Z] Session Start +Initial session for ZUGFeRD-Service implementation. 
+ +## Framework Decisions +- FastAPI (user preference) +- Pydantic v2+ for data models +- pytest with pytest-asyncio for testing +- hatchling for build system + +## Packaging Decisions +- pyproject.toml (modern Python packaging) +- Docker multi-stage build +- Nix flake-based packaging with buildPythonApplication + +## Testing Decisions +- TDD (test-first) approach +- All acceptance criteria must be verifiable without human intervention + + +## [2026-02-04T19:14:00.000Z] Task 1: Project Scaffold + +### hatchling Configuration Pattern +- For src-layout projects, MUST add `[tool.hatch.build.targets.wheel]` section +- Without this, hatchling cannot determine which files to ship +- Config: `packages = ["src"]` to specify src directory + +### Nix Environment Considerations +- Nix store is read-only, standard pip install fails +- Use temporary venv for verification: `python -m venv /tmp/test_env` +- Install to venv, verify imports, then cleanup + +### Entry Point Documentation +- Functions referenced in `[project.scripts]` MUST have docstrings +- These are public API entry points (CLI commands) +- Example: `zugferd-service = "src.main:run"` -> run() needs docstring + +### Module Docstring Convention +- Module-level docstrings: minimal, one line, describe purpose +- Entry point function docstrings: Args/Returns style for CLI documentation +- Both necessary for scaffolding clarity diff --git a/.sisyphus/notepads/zugferd-service/problems.md b/.sisyphus/notepads/zugferd-service/problems.md new file mode 100644 index 0000000..65abfb3 --- /dev/null +++ b/.sisyphus/notepads/zugferd-service/problems.md @@ -0,0 +1,7 @@ +# Problems - zugferd-service + +This file accumulates unresolved blockers. + +## [2026-02-04T18:12:44.864Z] Session Start +No unresolved problems at this time. 
+ diff --git a/.sisyphus/plans/zugferd-service.md b/.sisyphus/plans/zugferd-service.md new file mode 100644 index 0000000..43533f8 --- /dev/null +++ b/.sisyphus/plans/zugferd-service.md @@ -0,0 +1,1764 @@ +# ZUGFeRD-Service: Python REST API for Invoice Extraction and Validation + +## TL;DR + +> **Quick Summary**: Build a stateless FastAPI service that extracts structured data from ZUGFeRD/Factur-X invoices embedded in PDFs, validates invoice correctness, and is packaged for both Docker and NixOS deployment. +> +> **Deliverables**: +> - Complete FastAPI application with 3 endpoints (`/health`, `/extract`, `/validate`) +> - Pydantic models for all request/response schemas +> - ZUGFeRD extraction pipeline using factur-x library +> - 4 validation checks (pflichtfelder, betraege, ustid, pdf_abgleich) +> - Docker multi-stage build (production-ready) +> - Nix flake packaging (buildPythonApplication) +> - Comprehensive test suite (TDD, pytest) +> - README documentation with installation guide +> +> **Estimated Effort**: Medium-Large (18 tasks) +> **Parallel Execution**: YES - 6 waves +> **Critical Path**: Project Setup → Models → Extractor → Validator → API → Packaging + +--- + +## Context + +### Original Request +Build a Python-based REST API service for ZUGFeRD invoice extraction and validation. The service runs as a Docker container on NixOS and should also be packaged as a Nix flake. Key requirements include handling all ZUGFeRD profiles (MINIMUM through EXTENDED), validating invoice amounts and required fields, and comparing XML data against PDF text. 
+ +### Interview Summary +**Key Discussions**: +- **Framework choice**: FastAPI (user preference, excellent for REST APIs) +- **Project format**: pyproject.toml with hatchling (modern, Nix-friendly) +- **Nix packaging**: Flake-based approach with buildPythonApplication +- **Testing strategy**: TDD with pytest, source official sample PDFs +- **Authentication**: Open endpoints (no auth required) +- **File handling**: 10MB limit, reject password-protected PDFs +- **Calculations**: Standard rounding, 0.01 EUR tolerance + +**Research Findings**: +- factur-x library provides `get_xml_from_pdf()`, `get_level()`, `get_flavor()` for extraction +- Nix packaging follows mem0 pattern: pyproject=true, pythonRelaxDeps=true +- UN/ECE unit codes require mapping dictionary (C62→Piece, KGM→Kilogram, etc.) +- ZUGFeRD profiles detected via GuidelineSpecifiedDocumentContextParameter in XML + +### Metis Review +**Identified Gaps** (addressed): +- Password-protected PDFs → Reject with error +- File size limits → 10MB hard limit +- Decimal precision → Standard rounding +- Authentication → Confirmed open/stateless + +--- + +## Work Objectives + +### Core Objective +Create a production-ready, stateless REST API that extracts ZUGFeRD/Factur-X invoice data from PDFs, validates invoice correctness against business rules, and provides clear error reporting for invoice processing workflows. 
+ +### Concrete Deliverables +- `src/main.py` - FastAPI application with all endpoints +- `src/models.py` - Pydantic models for all data structures +- `src/extractor.py` - ZUGFeRD extraction logic +- `src/validator.py` - Validation checks implementation +- `src/pdf_parser.py` - PDF text extraction for cross-validation +- `src/utils.py` - Helper functions (unit codes, decimal handling) +- `pyproject.toml` - Project configuration with hatchling +- `Dockerfile` - Multi-stage production build +- `docker-compose.yml` - Local development setup +- `flake.nix` - Nix flake packaging +- `tests/` - Complete test suite with fixtures +- `README.md` - Installation and usage documentation + +### Definition of Done +- [ ] `nix build .#zugferd-service` completes without errors +- [ ] `docker build -t zugferd-service .` produces image <500MB +- [ ] `pytest` runs all tests with 100% pass rate +- [ ] `curl http://localhost:5000/health` returns `{"status": "healthy", "version": "1.0.0"}` +- [ ] All ZUGFeRD profiles correctly detected from sample PDFs +- [ ] All validation checks produce expected errors/warnings + +### Must Have +- All 3 API endpoints as specified +- Support for all ZUGFeRD profiles (MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED) +- All 4 validation checks (pflichtfelder, betraege, ustid, pdf_abgleich) +- Structured JSON error responses with error codes +- UTF-8 encoding throughout +- Structured JSON logging + +### Must NOT Have (Guardrails) +- ❌ Authentication middleware (open endpoints) +- ❌ Database or persistence layer (stateless only) +- ❌ Caching layers +- ❌ Rate limiting +- ❌ Metrics endpoints beyond /health +- ❌ CLI interface +- ❌ Web UI or admin dashboard +- ❌ Batch processing or queue system +- ❌ Online USt-ID validation (format check only) +- ❌ Support for ZUGFeRD 1.x (only 2.x) +- ❌ Abstraction layers for "future extensibility" + +--- + +## Verification Strategy + +> **UNIVERSAL RULE: ZERO HUMAN INTERVENTION** +> +> ALL tasks in this plan MUST be verifiable 
WITHOUT any human action. +> Every verification step is executed by the agent using tools (curl, pytest, nix, docker). + +### Test Decision +- **Infrastructure exists**: NO (new project) +- **Automated tests**: TDD (test-first) +- **Framework**: pytest with pytest-asyncio + +### TDD Workflow +Each implementation task follows RED-GREEN-REFACTOR: +1. **RED**: Write failing test first +2. **GREEN**: Implement minimum code to pass +3. **REFACTOR**: Clean up while keeping tests green + +### Test Infrastructure Setup (Task 0) +```bash +# Install pytest and test dependencies +pip install pytest pytest-asyncio httpx + +# Verify pytest works +pytest --version + +# Run initial test +pytest tests/ -v +``` + +### Agent-Executed QA Scenarios (MANDATORY) + +All verifications use: +- **API Testing**: `curl` commands with JSON assertions +- **Docker Testing**: `docker build` and `docker run` commands +- **Nix Testing**: `nix build` and `nix run` commands +- **Unit Testing**: `pytest` with specific test file targets + +--- + +## Execution Strategy + +### Parallel Execution Waves + +``` +Wave 1 (Start Immediately): +├── Task 1: Project scaffold (pyproject.toml, directories) +├── Task 2: Download ZUGFeRD sample PDFs +└── Task 3: Create Pydantic models + +Wave 2 (After Wave 1): +├── Task 4: Extractor unit tests + implementation +├── Task 5: PDF parser unit tests + implementation +└── Task 6: Utils (unit codes, decimal handling) + +Wave 3 (After Wave 2): +├── Task 7: Validator unit tests + implementation +├── Task 8: FastAPI app structure +└── Task 9: Health endpoint + +Wave 4 (After Wave 3): +├── Task 10: Extract endpoint +├── Task 11: Validate endpoint +└── Task 12: Error handling middleware + +Wave 5 (After Wave 4): +├── Task 13: Integration tests +├── Task 14: Dockerfile +└── Task 15: docker-compose.yml + +Wave 6 (After Wave 5): +├── Task 16: Nix flake packaging +├── Task 17: NixOS module example +└── Task 18: README documentation + +Critical Path: Task 1 → Task 4 → Task 7 → Task 
11 → Task 13 → Task 16 +``` + +### Dependency Matrix + +| Task | Depends On | Blocks | Can Parallelize With | +|------|------------|--------|---------------------| +| 1 | None | 2-18 | None (must be first) | +| 2 | 1 | 4, 5 | 3 | +| 3 | 1 | 4, 5, 7 | 2 | +| 4 | 2, 3 | 7, 10 | 5, 6 | +| 5 | 2 | 7 | 4, 6 | +| 6 | 1 | 4, 7 | 4, 5 | +| 7 | 4, 5, 6 | 11 | 8, 9 | +| 8 | 3 | 9, 10, 11 | 7 | +| 9 | 8 | 10, 11 | 7 | +| 10 | 4, 8, 9 | 13 | 11, 12 | +| 11 | 7, 8, 9 | 13 | 10, 12 | +| 12 | 8 | 13 | 10, 11 | +| 13 | 10, 11, 12 | 14, 16 | None | +| 14 | 13 | 16 | 15 | +| 15 | 13 | None | 14 | +| 16 | 14 | 17, 18 | None | +| 17 | 16 | 18 | None | +| 18 | 16, 17 | None | None | + +--- + +## TODOs + +### Wave 1: Project Foundation + +- [x] 1. Project Scaffold and Configuration + + **What to do**: + - Create project directory structure as specified + - Create `pyproject.toml` with hatchling build system + - Configure pytest in pyproject.toml + - Create `src/__init__.py` and `tests/__init__.py` + - Set up Python version requirement (3.11+) + + **Must NOT do**: + - Do NOT add optional dependencies beyond the `dev` test extras specified below + - Do NOT create README yet (separate task) + - Do NOT add pre-commit hooks + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Straightforward file creation with established patterns + - **Skills**: [`git-master`] + - `git-master`: For proper initial commit after scaffold + + **Parallelization**: + - **Can Run In Parallel**: NO + - **Parallel Group**: Sequential (must be first) + - **Blocks**: All subsequent tasks + - **Blocked By**: None + + **References**: + - **Pattern References**: mem0 pyproject.toml pattern (from librarian research) + - **External References**: https://hatch.pypa.io/latest/config/metadata/ (hatchling docs) + + **Files to Create**: + ``` + pyproject.toml + src/__init__.py + src/main.py (placeholder) + src/models.py (placeholder) + src/extractor.py (placeholder) + src/validator.py (placeholder) + src/pdf_parser.py (placeholder) + src/utils.py 
(placeholder) + tests/__init__.py + tests/conftest.py (pytest fixtures) + tests/fixtures/.gitkeep + ``` + + **pyproject.toml Structure**: + ```toml + [build-system] + requires = ["hatchling"] + build-backend = "hatchling.build" + + [project] + name = "zugferd-service" + version = "1.0.0" + description = "REST API for ZUGFeRD invoice extraction and validation" + requires-python = ">=3.11" + dependencies = [ + "fastapi>=0.109.0", + "uvicorn>=0.27.0", + "python-multipart>=0.0.6", + "factur-x>=2.5", + "pypdf>=4.0.0", + "pydantic>=2.5.0", + "lxml>=5.0.0", + ] + + [project.optional-dependencies] + dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "httpx>=0.27.0", + ] + + [project.scripts] + zugferd-service = "src.main:run" + + [tool.pytest.ini_options] + asyncio_mode = "auto" + testpaths = ["tests"] + ``` + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: pyproject.toml is valid + Tool: Bash + Steps: + 1. python -c "import tomllib; tomllib.load(open('pyproject.toml', 'rb'))" + 2. Assert: Exit code 0 + Expected Result: TOML parses without error + Evidence: Command output captured + + Scenario: Project structure exists + Tool: Bash + Steps: + 1. ls -la src/ + 2. Assert: main.py, models.py, extractor.py, validator.py, pdf_parser.py, utils.py exist + 3. ls -la tests/ + 4. Assert: conftest.py, fixtures/ exist + Expected Result: All required files present + Evidence: Directory listing captured + + Scenario: Dependencies install correctly + Tool: Bash + Steps: + 1. pip install -e ".[dev]" + 2. Assert: Exit code 0 + 3. python -c "import fastapi; import facturx; import pypdf" + 4. Assert: Exit code 0 + Expected Result: All dependencies importable + Evidence: pip output captured + ``` + + **Commit**: YES + - Message: `feat(project): initialize ZUGFeRD service with pyproject.toml and directory structure` + - Files: All created files + - Pre-commit: `python -c "import tomllib; tomllib.load(open('pyproject.toml', 'rb'))"` + +--- + +- [ ] 2. 
Download ZUGFeRD Sample PDFs + + **What to do**: + - Download official ZUGFeRD sample PDFs from FeRD/ZUGFeRD repositories + - Include samples for: MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED profiles + - Include a non-ZUGFeRD PDF for negative testing + - Store in `tests/fixtures/` + - Create fixture manifest documenting each file + + **Must NOT do**: + - Do NOT create synthetic PDFs + - Do NOT download more than 10 samples (keep focused) + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Download and organize files + - **Skills**: [] + - No special skills needed + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Task 3) + - **Blocks**: Tasks 4, 5 + - **Blocked By**: Task 1 + + **References**: + - **External References**: + - https://www.ferd-net.de/download/testrechnungen + - https://github.com/ZUGFeRD/mustangproject/tree/master/Mustang-CLI/src/test/resources + - https://github.com/akretion/factur-x/tree/master/tests + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Sample PDFs exist for all profiles + Tool: Bash + Steps: + 1. ls tests/fixtures/*.pdf | wc -l + 2. Assert: At least 5 PDF files + 3. file tests/fixtures/*.pdf + 4. Assert: All files identified as "PDF document" + Expected Result: Multiple valid PDF files in fixtures + Evidence: File listing captured + + Scenario: Fixture manifest documents samples + Tool: Bash + Steps: + 1. cat tests/fixtures/MANIFEST.md + 2. Assert: Contains entries for MINIMUM, BASIC, EN16931, EXTENDED + Expected Result: Manifest describes all test fixtures + Evidence: Manifest content captured + ``` + + **Commit**: YES + - Message: `test(fixtures): add official ZUGFeRD sample PDFs for all profiles` + - Files: `tests/fixtures/*.pdf`, `tests/fixtures/MANIFEST.md` + +--- + +- [ ] 3. 
Create Pydantic Models + + **What to do**: + - Define all Pydantic models as per API specification + - Models: ExtractRequest, ExtractResponse, ValidateRequest, ValidateResponse + - Nested models: Supplier, Buyer, LineItem, Totals, VatBreakdown, PaymentTerms + - Error models: ErrorResponse, ValidationError + - Add field validators where appropriate (e.g., VAT ID format) + + **Must NOT do**: + - Do NOT add ORM mappings (no database) + - Do NOT add serialization beyond Pydantic defaults + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Straightforward Pydantic model definitions from spec + - **Skills**: [] + - Standard Python work + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Task 2) + - **Blocks**: Tasks 4, 5, 7 + - **Blocked By**: Task 1 + + **References**: + - **API Specification**: User's detailed spec document (request/response schemas) + - **Pattern References**: https://docs.pydantic.dev/latest/concepts/models/ + + **Key Models from Spec**: + ```python + class Supplier(BaseModel): + name: str + street: str | None = None + postal_code: str | None = None + city: str | None = None + country: str | None = None + vat_id: str | None = None + email: str | None = None + + class LineItem(BaseModel): + position: int + article_number: str | None = None + article_number_buyer: str | None = None + description: str + quantity: float + unit: str # Human-readable, translated from UN/ECE code + unit_price: float + line_total: float + vat_rate: float | None = None + vat_amount: float | None = None + + class ExtractResponse(BaseModel): + is_zugferd: bool + zugferd_profil: str | None = None + xml_raw: str | None = None + xml_data: XmlData | None = None + pdf_text: str | None = None + extraction_meta: ExtractionMeta + ``` + + **Acceptance Criteria**: + + **TDD - Write Tests First:** + ```python + # tests/test_models.py + def test_extract_response_zugferd(): + response = ExtractResponse( + is_zugferd=True, + 
zugferd_profil="EN16931", + xml_raw=" ExtractResponse: + """Extract ZUGFeRD data from PDF bytes.""" + # 1. Check file size (<10MB) + # 2. Try to extract XML using factur-x + # 3. Detect profile and flavor + # 4. Parse XML to structured data + # 5. Extract PDF text for pdf_abgleich + # 6. Return ExtractResponse + ``` + + **Acceptance Criteria**: + + **TDD - Write Tests First:** + ```python + # tests/test_extractor.py + def test_extract_en16931_profile(sample_en16931_pdf): + result = extract_zugferd(sample_en16931_pdf) + assert result.is_zugferd is True + assert result.zugferd_profil == "EN16931" + assert result.xml_data is not None + assert result.xml_data.invoice_number is not None + + def test_extract_non_zugferd_pdf(sample_plain_pdf): + result = extract_zugferd(sample_plain_pdf) + assert result.is_zugferd is False + assert result.xml_data is None + assert result.pdf_text is not None + + def test_extract_corrupt_pdf(): + with pytest.raises(ExtractionError) as exc: + extract_zugferd(b"not a pdf") + assert exc.value.error_code == "invalid_pdf" + + def test_file_size_limit(): + large_pdf = b"x" * (11 * 1024 * 1024) # 11MB + with pytest.raises(ExtractionError) as exc: + extract_zugferd(large_pdf) + assert exc.value.error_code == "file_too_large" + ``` + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Extractor passes all unit tests + Tool: Bash + Steps: + 1. pytest tests/test_extractor.py -v + 2. Assert: All tests pass + Expected Result: 100% test pass rate + Evidence: pytest output captured + + Scenario: EN16931 profile correctly detected + Tool: Bash + Steps: + 1. python -c " + from src.extractor import extract_zugferd + with open('tests/fixtures/sample_en16931.pdf', 'rb') as f: + result = extract_zugferd(f.read()) + print(f'Profile: {result.zugferd_profil}') + print(f'Invoice: {result.xml_data.invoice_number}')" + 2. 
Assert: Output contains "Profile: EN16931" or "Profile: en16931" + Expected Result: Profile correctly identified + Evidence: Script output captured + + Scenario: Non-ZUGFeRD PDF handled gracefully + Tool: Bash + Steps: + 1. python -c " + from src.extractor import extract_zugferd + with open('tests/fixtures/sample_no_zugferd.pdf', 'rb') as f: + result = extract_zugferd(f.read()) + assert result.is_zugferd == False + assert result.pdf_text is not None + print('OK: Non-ZUGFeRD handled correctly')" + 2. Assert: Output contains "OK:" + Expected Result: Graceful handling + Evidence: Script output captured + ``` + + **Commit**: YES + - Message: `feat(extractor): implement ZUGFeRD extraction with profile detection` + - Files: `src/extractor.py`, `tests/test_extractor.py` + - Pre-commit: `pytest tests/test_extractor.py` + +--- + +- [ ] 5. PDF Text Parser Implementation (TDD) + + **What to do**: + - Write tests first with expected extraction patterns + - Implement PDF text extraction using pypdf + - Create regex patterns for extracting key values from text + - Extract: invoice_number, invoice_date, amounts (net, gross, vat) + - Return confidence scores for each extraction + + **Must NOT do**: + - Do NOT use OCR (text extraction only) + - Do NOT parse images in PDFs + + **Recommended Agent Profile**: + - **Category**: `unspecified-high` + - Reason: Regex pattern development requires careful testing + - **Skills**: [`systematic-debugging`] + - `systematic-debugging`: For developing and testing regex patterns + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 2 (with Tasks 4, 6) + - **Blocks**: Task 7 + - **Blocked By**: Task 2 + + **References**: + - **User Spec**: Regex patterns provided in spec (invoice_number, gross_amount patterns) + - **Library Docs**: https://pypdf.readthedocs.io/en/stable/ + + **Key Patterns from Spec**: + ```python + EXTRACTION_PATTERNS = { + "invoice_number": [ + r"Rechnungs?-?(?:Nr|Nummer)[.:\s]*([A-Z0-9\-]+)", + 
r"Invoice\s*(?:No|Number)?[.:\s]*([A-Z0-9\-]+)", + r"Beleg-?Nr[.:\s]*([A-Z0-9\-]+)" + ], + "gross_amount": [ + r"Brutto[:\s]*([0-9.,]+)\s*(?:EUR|€)?", + r"Gesamtbetrag[:\s]*([0-9.,]+)", + r"Total[:\s]*([0-9.,]+)\s*(?:EUR|€)?" + ], + # ... more patterns + } + ``` + + **Acceptance Criteria**: + + **TDD - Write Tests First:** + ```python + # tests/test_pdf_parser.py + def test_extract_invoice_number(): + text = "Rechnung\nRechnungs-Nr.: RE-2025-001234\nDatum: 04.02.2025" + result = extract_from_text(text) + assert result["invoice_number"] == "RE-2025-001234" + assert result["invoice_number_confidence"] >= 0.9 + + def test_extract_amounts(): + text = "Netto: 100,00 EUR\nMwSt 19%: 19,00 EUR\nBrutto: 119,00 EUR" + result = extract_from_text(text) + assert result["gross_amount"] == 119.00 + assert result["net_amount"] == 100.00 + + def test_german_number_format(): + text = "Gesamtbetrag: 1.234,56 €" + result = extract_from_text(text) + assert result["gross_amount"] == 1234.56 + ``` + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: PDF parser passes all unit tests + Tool: Bash + Steps: + 1. pytest tests/test_pdf_parser.py -v + 2. Assert: All tests pass + Expected Result: 100% test pass rate + Evidence: pytest output captured + + Scenario: Real PDF text extraction works + Tool: Bash + Steps: + 1. python -c " + from src.pdf_parser import extract_text_from_pdf, extract_from_text + with open('tests/fixtures/sample_en16931.pdf', 'rb') as f: + text = extract_text_from_pdf(f.read()) + print(f'Extracted {len(text)} characters') + result = extract_from_text(text) + print(f'Invoice: {result.get(\"invoice_number\", \"NOT FOUND\")}')" + 2. 
Assert: Output shows extracted characters and invoice number + Expected Result: Text extraction works on real PDFs + Evidence: Script output captured + ``` + + **Commit**: YES + - Message: `feat(pdf-parser): implement PDF text extraction with regex patterns` + - Files: `src/pdf_parser.py`, `tests/test_pdf_parser.py` + - Pre-commit: `pytest tests/test_pdf_parser.py` + +--- + +- [ ] 6. Utility Functions Implementation + + **What to do**: + - Create UN/ECE unit code mapping dictionary + - Implement decimal rounding helper (2 decimal places, standard rounding) + - Implement tolerance comparison function (0.01 EUR) + - Add German date format parser + + **Must NOT do**: + - Do NOT create configurable tolerance (hardcode 0.01) + - Do NOT add unit codes beyond common ones (expand later if needed) + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple utility functions + - **Skills**: [] + - Standard Python work + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 2 (with Tasks 4, 5) + - **Blocks**: Tasks 4, 7 + - **Blocked By**: Task 1 + + **References**: + - **UN/ECE Codes**: https://docs.peppol.eu/poacc/upgrade-3/codelist/UNECERec20/ + + **Unit Code Dictionary**: + ```python + UNECE_UNIT_CODES = { + "C62": "Stück", + "H87": "Stück", + "KGM": "Kilogramm", + "GRM": "Gramm", + "TNE": "Tonne", + "MTR": "Meter", + "KMT": "Kilometer", + "MTK": "Quadratmeter", + "LTR": "Liter", + "MLT": "Milliliter", + "DAY": "Tag", + "HUR": "Stunde", + "MON": "Monat", + "ANN": "Jahr", + "SET": "Set", + "PCE": "Stück", + "EA": "Stück", + } + ``` + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Unit code translation works + Tool: Bash + Steps: + 1. 
python -c " + from src.utils import translate_unit_code + assert translate_unit_code('C62') == 'Stück' + assert translate_unit_code('KGM') == 'Kilogramm' + assert translate_unit_code('UNKNOWN') == 'UNKNOWN' + print('OK: Unit codes translate correctly')" + 2. Assert: Output contains "OK:" + Expected Result: All translations correct + Evidence: Script output captured + + Scenario: Decimal comparison with tolerance + Tool: Bash + Steps: + 1. python -c " + from src.utils import amounts_match + assert amounts_match(100.00, 100.00) == True + assert amounts_match(100.00, 100.005) == True # Within 0.01 + assert amounts_match(100.00, 100.02) == False # Outside tolerance + print('OK: Tolerance comparison works')" + 2. Assert: Output contains "OK:" + Expected Result: Tolerance logic correct + Evidence: Script output captured + ``` + + **Commit**: YES + - Message: `feat(utils): add unit code mapping and decimal utilities` + - Files: `src/utils.py`, `tests/test_utils.py` + - Pre-commit: `pytest tests/test_utils.py` + +--- + +### Wave 3: Validation Logic + +- [ ] 7. Validator Implementation (TDD) + + **What to do**: + - Write tests first for each validation check + - Implement all 4 validation checks: + 1. `pflichtfelder` - Required fields check + 2. `betraege` - Amount calculations check + 3. `ustid` - VAT ID format check + 4. 
`pdf_abgleich` - XML vs PDF comparison + - Return structured ValidationResult with errors and warnings + + **Must NOT do**: + - Do NOT implement online USt-ID validation + - Do NOT add validation checks beyond spec + + **Recommended Agent Profile**: + - **Category**: `unspecified-high` + - Reason: Core business logic with multiple validation rules + - **Skills**: [`systematic-debugging`] + - `systematic-debugging`: For handling edge cases in validation logic + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 3 (with Tasks 8, 9) + - **Blocks**: Task 11 + - **Blocked By**: Tasks 4, 5, 6 + + **References**: + - **User Spec**: Detailed validation logic tables + - **Pattern References**: src/models.py ValidationError model + + **Validation Rules from Spec**: + + **pflichtfelder (Required Fields)**: + | Field | Severity | + |-------|----------| + | invoice_number | critical | + | invoice_date | critical | + | supplier.name | critical | + | supplier.vat_id | critical | + | buyer.name | critical | + | totals.net | critical | + | totals.gross | critical | + | totals.vat_total | critical | + | due_date | warning | + | payment_terms.iban | warning | + | line_items (min 1) | critical | + + **betraege (Calculations)**: + - line_total ≈ quantity × unit_price (±0.01) + - totals.net ≈ Σ(line_items.line_total) (±0.01) + - vat_breakdown.amount ≈ base × (rate/100) (±0.01) + - totals.vat_total ≈ Σ(vat_breakdown.amount) (±0.01) + - totals.gross ≈ totals.net + totals.vat_total (±0.01) + + **ustid (VAT ID Format)**: + | Country | Regex | + |---------|-------| + | DE | `^DE[0-9]{9}$` | + | AT | `^ATU[0-9]{8}$` | + | CH | `^CHE[0-9]{9}(MWST\|TVA\|IVA)$` | + + **Acceptance Criteria**: + + **TDD - Write Tests First:** + ```python + # tests/test_validator.py + def test_pflichtfelder_missing_invoice_number(): + data = XmlData(invoice_number=None, ...) 
+ result = validate_pflichtfelder(data) + assert any(e.field == "invoice_number" for e in result.errors) + assert any(e.severity == "critical" for e in result.errors) + + def test_betraege_calculation_mismatch(): + data = XmlData( + line_items=[LineItem(quantity=10, unit_price=9.99, line_total=100.00)], # Wrong! + ... + ) + result = validate_betraege(data) + assert any(e.error_code == "calculation_mismatch" for e in result.errors) + assert any("99.90" in e.message for e in result.errors) + + def test_ustid_valid_german(): + result = validate_ustid("DE123456789") + assert result.is_valid is True + + def test_ustid_invalid_format(): + result = validate_ustid("DE12345") # Too short + assert result.is_valid is False + assert result.error_code == "invalid_format" + + def test_pdf_abgleich_mismatch(): + xml_data = XmlData(invoice_number="RE-001", totals=Totals(gross=118.88)) + pdf_values = {"invoice_number": "RE-002", "gross_amount": 118.88} + result = validate_pdf_abgleich(xml_data, pdf_values) + assert any(e.field == "invoice_number" for e in result.errors) + ``` + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Validator passes all unit tests + Tool: Bash + Steps: + 1. pytest tests/test_validator.py -v + 2. Assert: All tests pass + Expected Result: 100% test pass rate + Evidence: pytest output captured + + Scenario: All validation checks work together + Tool: Bash + Steps: + 1. python -c " + from src.validator import validate_invoice + from src.models import XmlData, ValidateRequest + # Create a request with known issues + request = ValidateRequest( + xml_data={'invoice_number': None, ...}, + checks=['pflichtfelder', 'betraege'] + ) + result = validate_invoice(request) + print(f'Errors: {len(result.errors)}')" + 2. 
Assert: Script runs without error, shows error count + Expected Result: Validator processes all checks + Evidence: Script output captured + ``` + + **Commit**: YES + - Message: `feat(validator): implement all validation checks (pflichtfelder, betraege, ustid, pdf_abgleich)` + - Files: `src/validator.py`, `tests/test_validator.py` + - Pre-commit: `pytest tests/test_validator.py` + +--- + +### Wave 3 (continued): API Foundation + +- [ ] 8. FastAPI Application Structure + + **What to do**: + - Create FastAPI app instance in main.py + - Configure exception handlers for custom errors + - Set up structured JSON logging + - Add CORS middleware (for local development) + - Configure app metadata (title, version, description) + + **Must NOT do**: + - Do NOT add authentication middleware + - Do NOT add rate limiting + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Standard FastAPI setup + - **Skills**: [] + - Standard FastAPI patterns + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 3 (with Tasks 7, 9) + - **Blocks**: Tasks 9, 10, 11 + - **Blocked By**: Task 3 + + **References**: + - **FastAPI Docs**: https://fastapi.tiangolo.com/ + - **Pattern References**: User spec error handling table + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: FastAPI app starts + Tool: Bash + Steps: + 1. timeout 5 uvicorn src.main:app --port 5001 & + 2. sleep 2 + 3. curl -s http://localhost:5001/openapi.json | head -c 100 + 4. Assert: Output contains "openapi" + 5. kill %1 + Expected Result: OpenAPI schema accessible + Evidence: curl output captured + ``` + + **Commit**: YES + - Message: `feat(api): create FastAPI application structure with error handling` + - Files: `src/main.py` + - Pre-commit: `python -c "from src.main import app; print(app.title)"` + +--- + +- [ ] 9. 
Health Endpoint Implementation + + **What to do**: + - Implement `GET /health` endpoint + - Return version from pyproject.toml + - Return simple health status + + **Must NOT do**: + - Do NOT add complex health checks (no dependencies to check) + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple endpoint + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 3 (with Tasks 7, 8) + - **Blocks**: Tasks 10, 11 + - **Blocked By**: Task 8 + + **References**: + - **User Spec**: Health endpoint response format + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Health endpoint returns correct format + Tool: Bash + Steps: + 1. uvicorn src.main:app --port 5002 & + 2. sleep 2 + 3. curl -s http://localhost:5002/health + 4. Assert: Response contains {"status": "healthy", "version": "1.0.0"} + 5. kill %1 + Expected Result: Health check passes + Evidence: curl output captured + ``` + + **Commit**: YES (groups with Task 8) + - Message: `feat(api): add health endpoint` + - Files: `src/main.py` + +--- + +### Wave 4: API Endpoints + +- [ ] 10. 
Extract Endpoint Implementation (TDD) + + **What to do**: + - Write integration tests for `/extract` endpoint + - Implement `POST /extract` endpoint + - Accept base64-encoded PDF in JSON body + - Use extractor module for extraction + - Return structured ExtractResponse + + **Must NOT do**: + - Do NOT accept multipart file uploads (JSON only per spec) + + **Recommended Agent Profile**: + - **Category**: `unspecified-high` + - Reason: Integration of extractor into API endpoint + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 4 (with Tasks 11, 12) + - **Blocks**: Task 13 + - **Blocked By**: Tasks 4, 8, 9 + + **References**: + - **User Spec**: /extract request/response format + - **Pattern References**: src/extractor.py + + **Acceptance Criteria**: + + **TDD - Write Tests First:** + ```python + # tests/test_api.py + @pytest.fixture + def client(): + return TestClient(app) + + def test_extract_zugferd_pdf(client, sample_en16931_pdf_base64): + response = client.post("/extract", json={"pdf_base64": sample_en16931_pdf_base64}) + assert response.status_code == 200 + data = response.json() + assert data["is_zugferd"] is True + assert data["zugferd_profil"] is not None + + def test_extract_invalid_base64(client): + response = client.post("/extract", json={"pdf_base64": "not-valid-base64!!!"}) + assert response.status_code == 400 + assert response.json()["error"] == "invalid_base64" + + def test_extract_non_pdf(client): + # Base64 of "Hello World" (not a PDF) + response = client.post("/extract", json={"pdf_base64": "SGVsbG8gV29ybGQ="}) + assert response.status_code == 400 + assert response.json()["error"] == "invalid_pdf" + ``` + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Extract endpoint integration test + Tool: Bash + Steps: + 1. pytest tests/test_api.py -k test_extract -v + 2. 
Assert: All extract tests pass + Expected Result: Endpoint works correctly + Evidence: pytest output captured + + Scenario: Live extract endpoint test + Tool: Bash + Steps: + 1. uvicorn src.main:app --port 5003 & + 2. sleep 2 + 3. PDF_BASE64=$(base64 -w 0 tests/fixtures/sample_en16931.pdf) + 4. curl -s -X POST http://localhost:5003/extract \ + -H "Content-Type: application/json" \ + -d "{\"pdf_base64\": \"$PDF_BASE64\"}" | jq '.is_zugferd' + 5. Assert: Output is "true" + 6. kill %1 + Expected Result: ZUGFeRD detected + Evidence: curl output captured + ``` + + **Commit**: YES + - Message: `feat(api): implement /extract endpoint for PDF processing` + - Files: `src/main.py`, `tests/test_api.py` + - Pre-commit: `pytest tests/test_api.py -k test_extract` + +--- + +- [ ] 11. Validate Endpoint Implementation (TDD) + + **What to do**: + - Write integration tests for `/validate` endpoint + - Implement `POST /validate` endpoint + - Accept xml_data, pdf_text, and checks array + - Use validator module for validation + - Return structured ValidateResponse + + **Must NOT do**: + - Do NOT run checks not in the request's checks array + + **Recommended Agent Profile**: + - **Category**: `unspecified-high` + - Reason: Integration of validator into API endpoint + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 4 (with Tasks 10, 12) + - **Blocks**: Task 13 + - **Blocked By**: Tasks 7, 8, 9 + + **References**: + - **User Spec**: /validate request/response format + - **Pattern References**: src/validator.py + + **Acceptance Criteria**: + + **TDD - Write Tests First:** + ```python + def test_validate_all_checks(client): + response = client.post("/validate", json={ + "xml_data": {...}, + "pdf_text": "...", + "checks": ["pflichtfelder", "betraege", "ustid", "pdf_abgleich"] + }) + assert response.status_code == 200 + data = response.json() + assert "is_valid" in data + assert "errors" in data + assert "summary" in data + + def 
test_validate_partial_checks(client): + response = client.post("/validate", json={ + "xml_data": {...}, + "checks": ["pflichtfelder"] + }) + assert response.status_code == 200 + # Only pflichtfelder check should run + ``` + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Validate endpoint integration test + Tool: Bash + Steps: + 1. pytest tests/test_api.py -k test_validate -v + 2. Assert: All validate tests pass + Expected Result: Endpoint works correctly + Evidence: pytest output captured + + Scenario: Live validate endpoint with invalid data + Tool: Bash + Steps: + 1. uvicorn src.main:app --port 5004 & + 2. sleep 2 + 3. curl -s -X POST http://localhost:5004/validate \ + -H "Content-Type: application/json" \ + -d '{"xml_data": {"invoice_number": null}, "checks": ["pflichtfelder"]}' | jq '.is_valid' + 4. Assert: Output is "false" + 5. kill %1 + Expected Result: Validation detects missing field + Evidence: curl output captured + ``` + + **Commit**: YES + - Message: `feat(api): implement /validate endpoint for invoice validation` + - Files: `src/main.py`, `tests/test_api.py` + - Pre-commit: `pytest tests/test_api.py -k test_validate` + +--- + +- [ ] 12. 
Error Handling Middleware + + **What to do**: + - Implement exception handlers for all error types + - Map exceptions to HTTP status codes and error responses + - Ensure all errors return JSON format + - Add request logging + + **Must NOT do**: + - Do NOT expose stack traces in production + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Standard FastAPI error handling + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 4 (with Tasks 10, 11) + - **Blocks**: Task 13 + - **Blocked By**: Task 8 + + **References**: + - **User Spec**: Error handling table + + **Error Mapping from Spec**: + | Error | Status | error_code | + |-------|--------|------------| + | Invalid JSON | 400 | invalid_json | + | Not a PDF | 400 | invalid_pdf | + | PDF corrupt | 400 | corrupt_pdf | + | Base64 invalid | 400 | invalid_base64 | + | File too large | 400 | file_too_large | + | Password protected | 400 | password_protected_pdf | + | Internal error | 500 | internal_error | + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Error responses are JSON + Tool: Bash + Steps: + 1. uvicorn src.main:app --port 5005 & + 2. sleep 2 + 3. curl -s -X POST http://localhost:5005/extract \ + -H "Content-Type: application/json" \ + -d '{"pdf_base64": "invalid!!!"}' | jq '.error' + 4. Assert: Output is "invalid_base64" + 5. kill %1 + Expected Result: JSON error response + Evidence: curl output captured + ``` + + **Commit**: YES (groups with Task 10, 11) + - Message: `feat(api): add comprehensive error handling middleware` + - Files: `src/main.py` + +--- + +### Wave 5: Packaging + +- [ ] 13. 
Integration Tests + + **What to do**: + - Create end-to-end integration tests + - Test full workflow: extract → validate + - Test with all sample PDFs + - Test error scenarios + + **Must NOT do**: + - Do NOT create performance tests + + **Recommended Agent Profile**: + - **Category**: `unspecified-high` + - Reason: Integration testing requires comprehensive coverage + - **Skills**: [`systematic-debugging`] + + **Parallelization**: + - **Can Run In Parallel**: NO + - **Parallel Group**: Sequential (depends on all endpoints) + - **Blocks**: Tasks 14, 16 + - **Blocked By**: Tasks 10, 11, 12 + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Full integration test suite passes + Tool: Bash + Steps: + 1. pytest tests/ -v --tb=short + 2. Assert: All tests pass (exit code 0) + Expected Result: 100% test pass rate + Evidence: pytest output captured + ``` + + **Commit**: YES + - Message: `test(integration): add end-to-end integration tests` + - Files: `tests/test_integration.py` + - Pre-commit: `pytest tests/` + +--- + +- [ ] 14. Dockerfile Creation + + **What to do**: + - Create multi-stage Dockerfile as per spec + - Build stage: install dependencies + - Production stage: slim image with app + - Non-root user for security + - Expose port 5000 + + **Must NOT do**: + - Do NOT include dev dependencies in production image + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Standard Dockerfile from spec template + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 5 (with Task 15) + - **Blocks**: Task 16 + - **Blocked By**: Task 13 + + **References**: + - **User Spec**: Complete Dockerfile template provided + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Docker build succeeds + Tool: Bash + Steps: + 1. docker build -t zugferd-service:test . + 2. Assert: Exit code 0 + 3. docker images zugferd-service:test --format "{{.Size}}" + 4. 
Assert: Size < 500MB + Expected Result: Image builds and is reasonably sized + Evidence: Build output captured + + Scenario: Container runs and responds + Tool: Bash + Steps: + 1. docker run -d --name zugferd-test -p 5006:5000 zugferd-service:test + 2. sleep 3 + 3. curl -s http://localhost:5006/health | jq '.status' + 4. Assert: Output is "healthy" + 5. docker stop zugferd-test && docker rm zugferd-test + Expected Result: Container is functional + Evidence: curl output captured + ``` + + **Commit**: YES + - Message: `build(docker): add multi-stage Dockerfile for production` + - Files: `Dockerfile` + - Pre-commit: `docker build -t zugferd-service:test .` + +--- + +- [ ] 15. Docker Compose Configuration + + **What to do**: + - Create docker-compose.yml for local development + - Include volume mount for live reload + - Configure environment variables + - Add health check + + **Must NOT do**: + - Do NOT add additional services (no DB, no cache) + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple docker-compose setup + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 5 (with Task 14) + - **Blocks**: None + - **Blocked By**: Task 13 + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Docker Compose starts service + Tool: Bash + Steps: + 1. docker-compose up -d + 2. sleep 5 + 3. curl -s http://localhost:5000/health | jq '.status' + 4. Assert: Output is "healthy" + 5. docker-compose down + Expected Result: Compose setup works + Evidence: curl output captured + ``` + + **Commit**: YES + - Message: `build(docker): add docker-compose.yml for local development` + - Files: `docker-compose.yml` + +--- + +### Wave 6: Nix Packaging + +- [ ] 16. 
Nix Flake Packaging + + **What to do**: + - Create flake.nix with buildPythonApplication + - Use pythonRelaxDeps for dependency flexibility + - Include devShell for development + - Test with nix build and nix run + + **Must NOT do**: + - Do NOT create complex overlay structure + + **Recommended Agent Profile**: + - **Category**: `unspecified-high` + - Reason: Nix packaging requires careful dependency handling + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: NO + - **Parallel Group**: Sequential + - **Blocks**: Tasks 17, 18 + - **Blocked By**: Task 14 + + **References**: + - **Librarian Research**: mem0 Nix packaging pattern + - **External References**: https://github.com/mem0ai/mem0 Nix package + + **flake.nix Structure**: + ```nix + { + description = "ZUGFeRD REST API Service"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + pythonPackages = pkgs.python311Packages; + + zugferd-service = pythonPackages.buildPythonApplication { + pname = "zugferd-service"; + version = "1.0.0"; + pyproject = true; + src = ./.; + + pythonRelaxDeps = true; + + build-system = [ pythonPackages.hatchling ]; + + dependencies = with pythonPackages; [ + fastapi + uvicorn + pydantic + python-multipart + # factur-x - may need packaging + pypdf + lxml + ]; + + nativeCheckInputs = with pythonPackages; [ + pytestCheckHook + pytest-asyncio + httpx + ]; + + passthru = { + mainProgram = "zugferd-service"; + }; + + meta = { + description = "REST API for ZUGFeRD invoice extraction"; + license = pkgs.lib.licenses.mit; + }; + }; + in + { + packages.default = zugferd-service; + packages.zugferd-service = zugferd-service; + + devShells.default = pkgs.mkShell { + packages = [ + (pkgs.python311.withPackages (ps: with ps; [ + fastapi uvicorn pydantic pypdf lxml + pytest 
pytest-asyncio httpx + ])) + ]; + }; + } + ); + } + ``` + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: Nix flake builds successfully + Tool: Bash + Steps: + 1. nix build .#zugferd-service + 2. Assert: Exit code 0 + 3. ls -la result/bin/ + 4. Assert: zugferd-service binary exists + Expected Result: Nix package builds + Evidence: Build output captured + + Scenario: Nix package runs correctly + Tool: Bash + Steps: + 1. nix run .#zugferd-service & + 2. sleep 3 + 3. curl -s http://localhost:5000/health | jq '.status' + 4. Assert: Output is "healthy" + 5. kill %1 + Expected Result: Nix-built service runs + Evidence: curl output captured + + Scenario: Dev shell provides dependencies + Tool: Bash + Steps: + 1. nix develop -c python -c "import fastapi; import pypdf; print('OK')" + 2. Assert: Output is "OK" + Expected Result: Dev shell has all deps + Evidence: Command output captured + ``` + + **Commit**: YES + - Message: `build(nix): add flake.nix for Nix packaging` + - Files: `flake.nix` + - Pre-commit: `nix flake check` + +--- + +- [ ] 17. NixOS Service Module Example + + **What to do**: + - Create example NixOS module for deployment + - Include service configuration options + - Add systemd service definition + - Document usage in README + + **Must NOT do**: + - Do NOT create production-ready module (example only) + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Example module following standard patterns + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: NO + - **Parallel Group**: Sequential + - **Blocks**: Task 18 + - **Blocked By**: Task 16 + + **References**: + - **User Spec**: NixOS container configuration example + - **Librarian Research**: NixOS service module pattern + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: NixOS module syntax is valid + Tool: Bash + Steps: + 1. nix-instantiate --eval -E "import ./nix/module.nix" + 2. 
Assert: Exit code 0 or specific Nix evaluation output + Expected Result: Module parses correctly + Evidence: Nix output captured + ``` + + **Commit**: YES + - Message: `docs(nix): add example NixOS service module` + - Files: `nix/module.nix` + +--- + +- [ ] 18. README Documentation + + **What to do**: + - Create comprehensive README.md + - Include: Overview, Installation, Usage, API Reference + - Add examples for Docker, Nix, and direct Python usage + - Document all endpoints with curl examples + - Include troubleshooting section + + **Must NOT do**: + - Do NOT duplicate API spec (reference it) + + **Recommended Agent Profile**: + - **Category**: `writing` + - Reason: Documentation writing + - **Skills**: [] + + **Parallelization**: + - **Can Run In Parallel**: NO + - **Parallel Group**: Sequential (last task) + - **Blocks**: None + - **Blocked By**: Tasks 16, 17 + + **README Structure**: + ```markdown + # ZUGFeRD-Service + + ## Overview + ## Quick Start + ### Docker + ### Nix + ### Python (Development) + ## API Reference + ### GET /health + ### POST /extract + ### POST /validate + ## Configuration + ## NixOS Deployment + ## Development + ## Troubleshooting + ## License + ``` + + **Acceptance Criteria**: + + **Agent-Executed QA Scenarios:** + + ``` + Scenario: README contains all required sections + Tool: Bash + Steps: + 1. grep -c "## " README.md + 2. Assert: At least 8 sections + 3. grep "curl" README.md | wc -l + 4. 
Assert: At least 3 curl examples + Expected Result: Comprehensive documentation + Evidence: grep output captured + ``` + + **Commit**: YES + - Message: `docs: add comprehensive README with installation and usage guide` + - Files: `README.md` + +--- + +## Commit Strategy + +| After Task | Message | Files | Verification | +|------------|---------|-------|--------------| +| 1 | `feat(project): initialize ZUGFeRD service` | pyproject.toml, src/, tests/ | toml parse | +| 2 | `test(fixtures): add ZUGFeRD sample PDFs` | tests/fixtures/ | file exists | +| 3 | `feat(models): add Pydantic models` | src/models.py | pytest | +| 4 | `feat(extractor): implement extraction` | src/extractor.py | pytest | +| 5 | `feat(pdf-parser): implement PDF parsing` | src/pdf_parser.py | pytest | +| 6 | `feat(utils): add utilities` | src/utils.py | pytest | +| 7 | `feat(validator): implement validation` | src/validator.py | pytest | +| 8-9 | `feat(api): add FastAPI app + health` | src/main.py | curl | +| 10 | `feat(api): add /extract endpoint` | src/main.py | pytest | +| 11 | `feat(api): add /validate endpoint` | src/main.py | pytest | +| 12 | `feat(api): add error handling` | src/main.py | curl | +| 13 | `test(integration): add e2e tests` | tests/ | pytest | +| 14 | `build(docker): add Dockerfile` | Dockerfile | docker build | +| 15 | `build(docker): add docker-compose` | docker-compose.yml | compose up | +| 16 | `build(nix): add flake.nix` | flake.nix | nix build | +| 17 | `docs(nix): add NixOS module` | nix/module.nix | nix eval | +| 18 | `docs: add README` | README.md | grep check | + +--- + +## Success Criteria + +### Verification Commands +```bash +# All tests pass +pytest tests/ -v +# Expected: All tests pass (exit code 0) + +# Docker builds and runs +docker build -t zugferd-service . 
+docker run -p 5000:5000 zugferd-service & +curl http://localhost:5000/health +# Expected: {"status": "healthy", "version": "1.0.0"} + +# Nix builds and runs +nix build .#zugferd-service +./result/bin/zugferd-service & +curl http://localhost:5000/health +# Expected: {"status": "healthy", "version": "1.0.0"} + +# Extract endpoint works +PDF_BASE64=$(base64 -w 0 tests/fixtures/sample_en16931.pdf) +curl -X POST http://localhost:5000/extract \ + -H "Content-Type: application/json" \ + -d "{\"pdf_base64\": \"$PDF_BASE64\"}" | jq '.is_zugferd' +# Expected: true +``` + +### Final Checklist +- [ ] All 18 tasks completed +- [ ] All tests pass (pytest) +- [ ] Docker image builds (<500MB) +- [ ] Docker container runs and responds +- [ ] Nix flake builds without errors +- [ ] Nix package runs and responds +- [ ] All endpoints return expected responses +- [ ] README documents all features +- [ ] No "Must NOT Have" items present diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..52ee3f2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "zugferd-service" +version = "1.0.0" +description = "REST API for ZUGFeRD invoice extraction and validation" +requires-python = ">=3.11" +dependencies = [ + "fastapi>=0.109.0", + "uvicorn>=0.27.0", + "python-multipart>=0.0.6", + "factur-x>=2.5", + "pypdf>=4.0.0", + "pydantic>=2.5.0", + "lxml>=5.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "httpx>=0.27.0", +] + +[project.scripts] +zugferd-service = "src.main:run" + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..1dc5bc7 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,3 @@ +"""ZUGFeRD Service Package.""" + +__version__ = "1.0.0" diff --git 
a/src/extractor.py b/src/extractor.py new file mode 100644 index 0000000..3add2d2 --- /dev/null +++ b/src/extractor.py @@ -0,0 +1,3 @@ +"""ZUGFeRD extraction module.""" + +pass diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..c582506 --- /dev/null +++ b/src/main.py @@ -0,0 +1,20 @@ +"""FastAPI application for ZUGFeRD invoice processing.""" + +import uvicorn +from fastapi import FastAPI + +app = FastAPI( + title="ZUGFeRD Service", + version="1.0.0", + description="REST API for ZUGFeRD invoice extraction and validation", +) + + +def run(host: str = "0.0.0.0", port: int = 5000) -> None: + """Run the FastAPI application. + + Args: + host: Host to bind to. + port: Port to listen on. + """ + uvicorn.run(app, host=host, port=port) diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..1d2ac3b --- /dev/null +++ b/src/models.py @@ -0,0 +1,3 @@ +"""Pydantic models for ZUGFeRD service.""" + +pass diff --git a/src/pdf_parser.py b/src/pdf_parser.py new file mode 100644 index 0000000..a31bccb --- /dev/null +++ b/src/pdf_parser.py @@ -0,0 +1,3 @@ +"""PDF text parsing module.""" + +pass diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..fd307a9 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,3 @@ +"""Utility functions for ZUGFeRD service.""" + +pass diff --git a/src/validator.py b/src/validator.py new file mode 100644 index 0000000..c43fac1 --- /dev/null +++ b/src/validator.py @@ -0,0 +1,3 @@ +"""Validation module for ZUGFeRD invoices.""" + +pass diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..b49798d --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Test suite for ZUGFeRD service.""" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..ebde1d8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +"""Pytest configuration and shared fixtures.""" + +import pytest + + +@pytest.fixture +def sample_pdf_bytes(): + """Fixture providing 
sample PDF bytes for testing.""" + return b"%PDF-1.4\n%fake pdf content for testing\n%%EOF" diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep new file mode 100644 index 0000000..e69de29