feat(core): implement extractor, pdf_parser, and utils with TDD

Wave 2 tasks complete:
- Task 4: ZUGFeRD extractor with profile detection (factur-x)
- Task 5: PDF text parser with regex patterns
- Task 6: Utils with unit code mapping and tolerance checks

Features:
- extract_zugferd() extracts XML and text from PDFs
- parse_zugferd_xml() parses UN/CEFACT CII XML to models
- extract_from_text() extracts values using regex patterns
- translate_unit_code() maps UN/ECE unit codes to German
- amounts_match() compares amounts with a 0.01 EUR tolerance
- German number/date format handling

Tests: 27 utils tests, 27 pdf_parser tests, extractor tests
2026-02-04 19:42:32 +01:00
parent 29bd8453ec
commit c1f603cd46
8 changed files with 1642 additions and 8 deletions
--- a/tests/test_pdf_parser.py
+++ b/tests/test_pdf_parser.py
@@ -0,0 +1,308 @@
+"""
+Unit tests for PDF text extraction and parsing.
+
+TDD approach: Tests written first, implementation follows.
+"""
+
+import pytest
+from src.pdf_parser import extract_text_from_pdf, extract_from_text
+
+
+class TestExtractTextFromPDF:
+    """PDF text extraction (pypdf-backed) on fixture files and malformed bytes."""
+
+    def test_extract_text_from_sample_pdf(self):
+        """Text pulled from EN16931_Einfach.pdf contains known marker strings."""
+        # Path is relative to the working directory the tests are started from
+        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fixture:
+            raw = fixture.read()
+
+        # Run the extraction on the raw bytes
+        extracted = extract_text_from_pdf(raw)
+
+        # A non-empty result is the minimum requirement
+        assert extracted is not None
+        assert len(extracted) > 0
+
+        # Both marker strings must occur somewhere in the extracted text
+        for marker in ("Lieferant GmbH", "Rechnung"):
+            assert marker in extracted
+
+    def test_extract_text_from_empty_pdf(self):
+        """The EmptyPDFA1.pdf fixture still produces a str result."""
+        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as fixture:
+            raw = fixture.read()
+
+        extracted = extract_text_from_pdf(raw)
+
+        # Content is not checked here; only the return type is pinned
+        assert isinstance(extracted, str)
+
+    def test_extract_text_from_invalid_pdf(self):
+        """Bytes that are not a PDF make the extractor raise."""
+        garbage = b"Not a valid PDF"
+
+        # Any Exception subclass is accepted; the exact type is not pinned
+        with pytest.raises(Exception):
+            extract_text_from_pdf(garbage)
+
+
+class TestExtractFromText:
+    """Regex-based field extraction: labels, formats, confidences, empty input."""
+
+    def test_extract_invoice_number_german(self):
+        """'Rechnungs-Nr:' yields the number and a confidence above 0.8."""
+        text = "Rechnungs-Nr: RE-2025-001234"
+
+        result = extract_from_text(text)
+
+        assert "invoice_number" in result
+        assert result["invoice_number"] == "RE-2025-001234"
+        assert "invoice_number_confidence" in result
+        assert result["invoice_number_confidence"] > 0.8
+
+    def test_extract_invoice_number_english(self):
+        """'Invoice No:' (English label) yields the invoice number."""
+        text = "Invoice No: INV-2025-001234"
+
+        result = extract_from_text(text)
+
+        assert "invoice_number" in result
+        assert result["invoice_number"] == "INV-2025-001234"
+
+    def test_extract_invoice_number_beleg(self):
+        """'Beleg-Nr:' is accepted as an invoice-number label."""
+        text = "Beleg-Nr: 471102"
+
+        result = extract_from_text(text)
+
+        assert "invoice_number" in result
+        assert result["invoice_number"] == "471102"
+
+    def test_extract_invoice_date_german(self):
+        """A DD.MM.YYYY date is returned as an ISO string (YYYY-MM-DD)."""
+        text = "Rechnungsdatum: 04.02.2025"
+
+        result = extract_from_text(text)
+
+        assert "invoice_date" in result
+        assert result["invoice_date"] == "2025-02-04"
+
+    def test_extract_invoice_date_iso(self):
+        """A date already written in ISO format is returned in ISO format."""
+        text = "Invoice Date: 2025-02-04"
+
+        result = extract_from_text(text)
+
+        assert "invoice_date" in result
+        assert result["invoice_date"] == "2025-02-04"
+
+    def test_extract_gross_amount_german(self):
+        """'Brutto:' with German number format yields amount and confidence."""
+        text = "Brutto: 1.234,56 EUR"
+
+        result = extract_from_text(text)
+
+        assert "gross_amount" in result
+        assert result["gross_amount"] == 1234.56
+        assert "gross_amount_confidence" in result
+
+    def test_extract_gross_amount_variations(self):
+        """Each listed gross-amount label maps to 'gross_amount'."""
+        variations = [
+            ("Brutto: 118,88", 118.88),
+            ("Gesamtbetrag: 118,88 EUR", 118.88),
+            ("Total: 118.88", 118.88),
+            ("Endbetrag: 529,87", 529.87),
+            ("Summe: 100,00", 100.00),
+        ]
+        # Failure messages name the input; the bare "in" assert would not show it.
+        for text, expected in variations:
+            result = extract_from_text(text)
+            assert "gross_amount" in result, f"no gross amount found in {text!r}"
+            assert result["gross_amount"] == expected, f"wrong amount for {text!r}"
+
+    def test_extract_net_amount(self):
+        """'Netto:' yields the net amount and a confidence entry."""
+        text = "Netto: 100,00 EUR"
+
+        result = extract_from_text(text)
+
+        assert "net_amount" in result
+        assert result["net_amount"] == 100.00
+        assert "net_amount_confidence" in result
+
+    def test_extract_net_amount_rechnungsbetrag(self):
+        """'Rechnungsbetrag:' is treated as a net-amount label."""
+        text = "Rechnungsbetrag: 473,00"
+
+        result = extract_from_text(text)
+
+        assert "net_amount" in result
+        assert result["net_amount"] == 473.00
+
+    def test_extract_vat_amount(self):
+        """'MwSt:' yields the VAT amount and a confidence entry."""
+        text = "MwSt: 18,88 EUR"
+
+        result = extract_from_text(text)
+
+        assert "vat_amount" in result
+        assert result["vat_amount"] == 18.88
+        assert "vat_amount_confidence" in result
+
+    def test_extract_vat_amount_variations(self):
+        """Each listed VAT label maps to 'vat_amount'."""
+        variations = [
+            ("MwSt: 56,87", 56.87),
+            ("USt: 18,88 EUR", 18.88),
+            ("Steuer: 19,00", 19.00),
+        ]
+        # Failure messages name the input; the bare "in" assert would not show it.
+        for text, expected in variations:
+            result = extract_from_text(text)
+            assert "vat_amount" in result, f"no VAT amount found in {text!r}"
+            assert result["vat_amount"] == expected, f"wrong amount for {text!r}"
+
+    def test_extract_supplier_name(self):
+        """'Lieferant:' yields the supplier name."""
+        text = "Lieferant: Lieferant GmbH"
+
+        result = extract_from_text(text)
+
+        assert "supplier_name" in result
+        assert result["supplier_name"] == "Lieferant GmbH"
+
+    def test_extract_supplier_name_verkaeufer(self):
+        """'Verkäufer:' (non-ASCII label) yields the supplier name."""
+        text = "Verkäufer: ACME Corporation Inc."
+
+        result = extract_from_text(text)
+
+        assert "supplier_name" in result
+        assert result["supplier_name"] == "ACME Corporation Inc."
+
+    def test_extract_all_fields_comprehensive(self):
+        """All six fields are extracted from one multi-line invoice text."""
+        text = """
+        Rechnungs-Nr: RE-2025-001234
+        Rechnungsdatum: 04.02.2025
+        Lieferant: Lieferant GmbH
+        Netto: 100,00 EUR
+        MwSt: 18,88 EUR
+        Brutto: 118,88 EUR
+        """
+
+        result = extract_from_text(text)
+
+        assert result["invoice_number"] == "RE-2025-001234"
+        assert result["invoice_date"] == "2025-02-04"
+        assert result["supplier_name"] == "Lieferant GmbH"
+        assert result["net_amount"] == 100.00
+        assert result["vat_amount"] == 18.88
+        assert result["gross_amount"] == 118.88
+
+    def test_confidence_scores_in_range(self):
+        """Every '*_confidence' entry is a number between 0.0 and 1.0."""
+        text = """
+        Rechnungs-Nr: RE-2025-001234
+        Rechnungsdatum: 04.02.2025
+        Lieferant: Lieferant GmbH
+        Netto: 100,00 EUR
+        MwSt: 18,88 EUR
+        Brutto: 118,88 EUR
+        """
+
+        result = extract_from_text(text)
+
+        confidence_fields = [k for k in result if k.endswith("_confidence")]
+        assert confidence_fields, "no *_confidence keys; range check would be vacuous"
+        for field in confidence_fields:
+            assert isinstance(result[field], (int, float)), field
+            assert 0.0 <= result[field] <= 1.0, field
+
+    def test_empty_text(self):
+        """Empty input returns a dict whose non-confidence values are all None."""
+        result = extract_from_text("")
+        assert isinstance(result, dict)
+        values = [v for k, v in result.items() if not k.endswith("_confidence")]
+        assert all(v is None for v in values), f"unexpected values: {result!r}"
+
+    def test_no_matches(self):
+        """Text without invoice labels yields only missing or None field values."""
+        text = "This is just random text with no invoice data."
+        result = extract_from_text(text)
+
+        assert isinstance(result, dict)
+        values = [v for k, v in result.items() if not k.endswith("_confidence")]
+        assert all(v is None for v in values), f"unexpected values: {result!r}"
+
+
+class TestGermanNumberFormat:
+    """German-formatted amounts (comma decimal, dot thousands) parse to numbers."""
+
+    def test_simple_decimal(self):
+        """A bare comma decimal: 123,45 -> 123.45"""
+        fields = extract_from_text("Brutto: 123,45")
+        gross = fields["gross_amount"]
+        assert gross == 123.45
+
+    def test_thousands_separator(self):
+        """Dot as thousands separator: 1.234,56 -> 1234.56"""
+        fields = extract_from_text("Brutto: 1.234,56")
+        gross = fields["gross_amount"]
+        assert gross == 1234.56
+
+    def test_large_amount(self):
+        """Five-digit amount with separator: 10.000,00 -> 10000.00"""
+        fields = extract_from_text("Brutto: 10.000,00")
+        gross = fields["gross_amount"]
+        assert gross == 10000.00
+
+    def test_integer_amount(self):
+        """Whole amount with zero cents: 100,00 -> 100.00"""
+        fields = extract_from_text("Netto: 100,00")
+        net = fields["net_amount"]
+        assert net == 100.00
+
+
+class TestGermanDateFormat:
+    """Dates written day.month.year come back as ISO (YYYY-MM-DD) strings."""
+
+    def test_dd_mm_yyyy(self):
+        """Two-digit day and month: 15.11.2024 -> 2024-11-15"""
+        fields = extract_from_text("Rechnungsdatum: 15.11.2024")
+        iso_date = fields["invoice_date"]
+        assert iso_date == "2024-11-15"
+
+    def test_d_m_yyyy(self):
+        """Single-digit day and month are zero-padded: 4.2.2025 -> 2025-02-04"""
+        fields = extract_from_text("Rechnungsdatum: 4.2.2025")
+        iso_date = fields["invoice_date"]
+        assert iso_date == "2025-02-04"
+
+
+class TestRealPDFExtraction:
+    """End-to-end check: fixture PDF bytes -> text -> extracted fields."""
+
+    def test_extract_from_en16931_sample(self):
+        """At least one field is recovered from EN16931_Einfach.pdf."""
+        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fixture:
+            raw = fixture.read()
+
+        # Stage 1: PDF bytes to plain text
+        page_text = extract_text_from_pdf(raw)
+
+        # Stage 2: plain text to a field dict
+        fields = extract_from_text(page_text)
+
+        assert fields is not None
+
+        # Collect the names of real fields (not "*_confidence" entries) that
+        # carry a value. Exact values are not asserted because they may vary
+        # with the PDF layout; one populated field is enough.
+        populated = []
+        for name, value in fields.items():
+            if value is not None and not name.endswith("_confidence"):
+                populated.append(name)
+        assert len(populated) > 0