feat(core): implement extractor, pdf_parser, and utils with TDD

Wave 2 tasks complete:
- Task 4: ZUGFeRD extractor with profile detection (factur-x)
- Task 5: PDF text parser with regex patterns
- Task 6: Utils with unit code mapping and tolerance checks

Features:
- extract_zugferd() extracts XML and text from PDFs
- parse_zugferd_xml() parses UN/CEFACT CII XML to models
- extract_from_text() extracts values using regex patterns
- translate_unit_code() maps UN/ECE codes to German
- amounts_match() checks with 0.01 EUR tolerance
- German number/date format handling

Tests: 27 utils tests, 27 pdf_parser tests, extractor tests
2026-02-04 19:42:32 +01:00
parent 29bd8453ec
commit c1f603cd46
8 changed files with 1642 additions and 8 deletions
--- a/tests/test_extractor.py
+++ b/tests/test_extractor.py
@@ -0,0 +1,303 @@
+"""Tests for ZUGFeRD extractor.
+
+Tests are written following TDD: FAILING TESTS FIRST (RED phase),
+then implementation makes them pass (GREEN phase).
+"""
+
+import pytest
+import base64
+
+
+class TestExtractionError:
+    """Checks for the ExtractionError exception type."""
+
+    def test_extraction_error_initialization(self):
+        """All three fields passed to the constructor are readable afterwards."""
+        from src.extractor import ExtractionError
+
+        fields = {
+            "error_code": "corrupt_pdf",
+            "message": "PDF is corrupted",
+            "details": "Trailer not found",
+        }
+        exc = ExtractionError(**fields)
+        observed = (exc.error_code, exc.message, exc.details)
+        assert observed == ("corrupt_pdf", "PDF is corrupted", "Trailer not found")
+
+    def test_extraction_error_without_details(self):
+        """Omitting ``details`` leaves it as an empty string."""
+        from src.extractor import ExtractionError
+
+        exc = ExtractionError(message="Not a PDF file", error_code="invalid_pdf")
+        assert exc.details == ""
+        assert exc.message == "Not a PDF file"
+        assert exc.error_code == "invalid_pdf"
+
+    def test_extraction_error_is_exception(self):
+        """ExtractionError is an Exception whose str() is the message."""
+        from src.extractor import ExtractionError
+
+        exc = ExtractionError(error_code="file_too_large", message="File too large")
+        assert str(exc) == "File too large"
+        assert isinstance(exc, Exception)
+
+
+class TestFileSizeValidation:
+    """Test file size validation in extract_zugferd()."""
+
+    def test_file_size_limit_exactly_10mb(self):
+        """Test PDF of exactly 10MB is rejected as too large.
+
+        The boundary value itself must already yield ``file_too_large``.
+        """
+        from src.extractor import extract_zugferd, ExtractionError
+
+        # 10MB = 10 * 1024 * 1024 bytes
+        large_pdf = b"X" * (10 * 1024 * 1024)
+
+        # Should raise file_too_large error
+        with pytest.raises(ExtractionError) as exc_info:
+            extract_zugferd(large_pdf)
+
+        assert exc_info.value.error_code == "file_too_large"
+
+    def test_file_size_limit_10mb_plus_one_byte(self):
+        """Test PDF one byte over 10MB limit is rejected."""
+        from src.extractor import extract_zugferd, ExtractionError
+
+        # 10MB + 1 byte
+        too_large = b"X" * (10 * 1024 * 1024 + 1)
+
+        with pytest.raises(ExtractionError) as exc_info:
+            extract_zugferd(too_large)
+
+        assert exc_info.value.error_code == "file_too_large"
+
+    def test_file_size_under_10mb_accepted(self):
+        """Test PDF under 10MB is not rejected by the size check."""
+        from src.extractor import extract_zugferd, ExtractionError
+
+        # Small PDF (9MB)
+        small_pdf = b"X" * (9 * 1024 * 1024)
+
+        # Payload is not a real PDF, so a different ExtractionError is fine
+        try:
+            extract_zugferd(small_pdf)
+        except ExtractionError as e:
+            # Only the size-limit error code would be a failure here
+            assert e.error_code != "file_too_large"
+
+
+class TestNonZUGFeRDPDF:
+    """Extraction behaviour for a PDF that carries no ZUGFeRD XML."""
+
+    def test_non_zugferd_pdf(self):
+        """A plain PDF yields is_zugferd=False, no XML fields, but text and meta."""
+        from src.extractor import extract_zugferd
+
+        # Read the non-ZUGFeRD sample fixture
+        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as fh:
+            raw = fh.read()
+
+        outcome = extract_zugferd(raw)
+
+        assert outcome.is_zugferd is False
+        assert outcome.zugferd_profil is None
+        assert outcome.xml_raw is None
+        assert outcome.xml_data is None
+        assert outcome.pdf_text is not None
+        assert 0 < len(outcome.pdf_text)
+        assert 1 <= outcome.extraction_meta.pages
+        assert 0 <= outcome.extraction_meta.extraction_time_ms
+
+
+class TestEN16931Extraction:
+    """Extraction checks against the EN16931 sample PDF."""
+
+    def test_extract_en16931_profile(self):
+        """The EN16931 sample is recognised as ZUGFeRD with profile EN16931."""
+        from src.extractor import extract_zugferd
+
+        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fh:
+            raw = fh.read()
+
+        outcome = extract_zugferd(raw)
+
+        assert outcome.is_zugferd is True
+        assert outcome.zugferd_profil == "EN16931"
+        assert outcome.xml_raw is not None and len(outcome.xml_raw) > 0
+        assert outcome.xml_data is not None
+        assert outcome.pdf_text is not None
+        meta = outcome.extraction_meta
+        assert meta.xml_attachment_name is not None
+        assert meta.pages >= 1
+        assert meta.extraction_time_ms >= 0
+
+    def test_extract_all_required_fields(self):
+        """Every checked XmlData field of the EN16931 sample carries a value."""
+        from src.extractor import extract_zugferd
+
+        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fh:
+            raw = fh.read()
+
+        outcome = extract_zugferd(raw)
+
+        assert outcome.xml_data is not None
+        invoice = outcome.xml_data
+
+        # Header strings must be present and non-empty
+        for attr in ("invoice_number", "invoice_date"):
+            value = getattr(invoice, attr)
+            assert value is not None and len(value) > 0
+
+        # Nested structures must exist
+        for attr in ("supplier", "buyer", "line_items", "totals"):
+            assert getattr(invoice, attr) is not None
+
+        # Supplier and buyer both need a non-empty name
+        for party in (invoice.supplier, invoice.buyer):
+            assert party.name is not None and len(party.name) > 0
+
+        # First line item
+        assert len(invoice.line_items) > 0
+        head = invoice.line_items[0]
+        assert head.position >= 1
+        assert head.description is not None and len(head.description) > 0
+        assert head.quantity > 0
+        assert head.unit is not None and len(head.unit) > 0
+        assert head.unit_price > 0
+        assert head.line_total > 0
+
+        # Totals
+        totals = invoice.totals
+        assert totals.line_total_sum > 0
+        assert totals.net > 0
+        assert totals.vat_total >= 0
+        assert totals.gross > 0
+
+
+class TestErrorHandling:
+    """Test error handling for various PDF issues."""
+
+    def test_corrupt_pdf_raises_error(self):
+        """Test corrupt PDF raises ExtractionError with correct code."""
+        from src.extractor import extract_zugferd, ExtractionError
+
+        # Invalid PDF data
+        corrupt_pdf = b"NOT A PDF FILE AT ALL"
+
+        with pytest.raises(ExtractionError) as exc_info:
+            extract_zugferd(corrupt_pdf)
+
+        # Should raise either corrupt_pdf or invalid_pdf
+        assert exc_info.value.error_code in ["corrupt_pdf", "invalid_pdf"]
+
+    def test_empty_pdf_raises_error(self):
+        """Test empty PDF raises ExtractionError."""
+        from src.extractor import extract_zugferd, ExtractionError
+
+        with pytest.raises(ExtractionError):
+            extract_zugferd(b"")
+
+    def test_invalid_base64(self):
+        """Test invalid base64 is rejected by strict decoding.
+
+        NOTE(review): decoding presumably happens in the API layer before
+        extract_zugferd() is called, so this only pins the decoding step;
+        the extractor is never reached with such a payload.
+        """
+        # '$' is not part of the base64 alphabet
+        invalid_base64 = b"$$$INVALID$$$"
+
+        # binascii.Error is a subclass of ValueError, so ValueError covers
+        # the decoding failure without relying on the undocumented
+        # ``base64.binascii`` attribute. pytest.raises also makes the test
+        # fail if decoding unexpectedly succeeds instead of passing
+        # silently.
+        with pytest.raises(ValueError):
+            base64.b64decode(invalid_base64, validate=True)
+
+
+class TestPDFTextExtraction:
+    """Test PDF text extraction."""
+
+    def test_pdf_text_extraction(self):
+        """Test that non-empty text is extracted from the EN16931 sample PDF."""
+        from src.extractor import extract_zugferd
+
+        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f:
+            pdf_bytes = f.read()
+
+        result = extract_zugferd(pdf_bytes)
+
+        assert result.pdf_text is not None
+        assert len(result.pdf_text) > 0
+        # The wording of the text is deliberately not asserted: the PDF may
+        # contain invoice-related terms in German or English, so only the
+        # presence of extracted text is checked here.
+
+
+class TestExtractionMeta:
+    """Checks on the extraction_meta part of the extraction result."""
+
+    def test_extraction_meta_populated(self):
+        """Meta of the EN16931 sample reports a page count and a timing value."""
+        from src.extractor import extract_zugferd
+
+        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fh:
+            raw = fh.read()
+
+        meta = extract_zugferd(raw).extraction_meta
+
+        assert meta is not None
+        assert meta.pages >= 1
+        assert meta.extraction_time_ms >= 0
+
+    def test_extraction_meta_non_zugferd(self):
+        """Meta of a non-ZUGFeRD PDF has pages and timing but no attachment name."""
+        from src.extractor import extract_zugferd
+
+        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as fh:
+            raw = fh.read()
+
+        meta = extract_zugferd(raw).extraction_meta
+
+        assert meta is not None
+        assert meta.pages >= 1
+        assert meta.extraction_time_ms >= 0
+        assert meta.xml_attachment_name is None
+
+
+class TestExtendedProfile:
+    """Profile detection for the EXTENDED sample PDF (fixture must exist)."""
+
+    def test_extract_extended_profile(self):
+        """The EXTENDED sample is reported as ZUGFeRD with profile EXTENDED."""
+        from src.extractor import extract_zugferd
+
+        with open("tests/fixtures/zugferd_2p1_EXTENDED_PDFA-3A.pdf", "rb") as fh:
+            raw = fh.read()
+
+        outcome = extract_zugferd(raw)
+
+        assert outcome.xml_data is not None
+        assert outcome.zugferd_profil == "EXTENDED"
+        assert outcome.is_zugferd is True
+
+
+class TestZUGFeRDProfileVariations:
+    """Profile detection across other ZUGFeRD profile variants."""
+
+    def test_detect_basicwl_profile(self):
+        """The BASIC WL sample is recognised as ZUGFeRD with some profile set."""
+        from src.extractor import extract_zugferd
+
+        with open("tests/fixtures/validAvoir_FR_type380_BASICWL.pdf", "rb") as fh:
+            raw = fh.read()
+
+        outcome = extract_zugferd(raw)
+
+        # Any detected profile is accepted (BASIC, BASICWL, etc.)
+        assert outcome.is_zugferd is True
+        assert outcome.zugferd_profil is not None
+        assert outcome.xml_data is not None