# zugferd-service/tests/test_extractor.py

"""Tests for ZUGFeRD extractor.

Tests are written following TDD: FAILING TESTS FIRST (RED phase),
then implementation makes them pass (GREEN phase).
"""

import pytest
import base64


class TestExtractionError:
    """Checks for the ``ExtractionError`` exception type."""

    def test_extraction_error_initialization(self):
        """Every constructor argument is exposed as an attribute."""
        from src.extractor import ExtractionError

        expected = {
            "error_code": "corrupt_pdf",
            "message": "PDF is corrupted",
            "details": "Trailer not found",
        }
        exc = ExtractionError(**expected)

        # Compare attribute by attribute, in the order they were supplied.
        for attribute, value in expected.items():
            assert getattr(exc, attribute) == value

    def test_extraction_error_without_details(self):
        """Omitting ``details`` leaves it as an empty string."""
        from src.extractor import ExtractionError

        exc = ExtractionError(
            error_code="invalid_pdf",
            message="Not a PDF file",
        )

        assert exc.error_code == "invalid_pdf"
        assert exc.message == "Not a PDF file"
        assert exc.details == ""

    def test_extraction_error_is_exception(self):
        """The error is an ``Exception`` whose ``str()`` is the message."""
        from src.extractor import ExtractionError

        exc = ExtractionError(
            error_code="file_too_large",
            message="File too large",
        )

        assert isinstance(exc, Exception)
        assert str(exc) == "File too large"


class TestFileSizeValidation:
    """Size-limit checks for ``extract_zugferd()`` around the 10 MB mark."""

    def test_file_size_limit_exactly_10mb(self):
        """A payload of exactly 10 MB is rejected with ``file_too_large``.

        NOTE(review): an earlier docstring claimed this size is "accepted",
        which contradicted the assertion in this test. The assertion is kept
        as the source of truth; confirm the intended boundary against the
        extractor implementation.
        """
        from src.extractor import extract_zugferd, ExtractionError

        # 10 MB = 10 * 1024 * 1024 bytes of filler (not a real PDF).
        payload = b"X" * (10 * 1024 * 1024)

        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(payload)

        assert exc_info.value.error_code == "file_too_large"

    def test_file_size_limit_10mb_plus_one_byte(self):
        """A payload one byte over 10 MB is rejected with ``file_too_large``."""
        from src.extractor import extract_zugferd, ExtractionError

        # 10 MB + 1 byte of filler.
        payload = b"X" * (10 * 1024 * 1024 + 1)

        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(payload)

        assert exc_info.value.error_code == "file_too_large"

    def test_file_size_under_10mb_accepted(self):
        """A 9 MB payload is never rejected for its size.

        The payload is filler bytes rather than a valid PDF, so the extractor
        may still raise ``ExtractionError``; the only requirement is that the
        error code is not ``file_too_large``. Completing without an error is
        also acceptable.
        """
        from src.extractor import extract_zugferd, ExtractionError

        payload = b"X" * (9 * 1024 * 1024)

        try:
            extract_zugferd(payload)
        except ExtractionError as exc:
            # Any other error code is fine; only the size rejection is wrong.
            assert exc.error_code != "file_too_large"


class TestNonZUGFeRDPDF:
    """Extraction behaviour for a PDF that carries no ZUGFeRD XML."""

    def test_non_zugferd_pdf(self):
        """A plain PDF yields ``is_zugferd=False`` with text and metadata only."""
        from src.extractor import extract_zugferd

        # Fixture path is relative to the directory the tests are run from.
        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as fixture:
            raw = fixture.read()

        outcome = extract_zugferd(raw)

        # No XML-related information may be reported.
        assert outcome.is_zugferd is False
        assert outcome.zugferd_profil is None
        assert outcome.xml_raw is None
        assert outcome.xml_data is None

        # The text layer is still extracted.
        assert outcome.pdf_text is not None
        assert len(outcome.pdf_text) > 0

        # Basic metadata is filled in.
        assert outcome.extraction_meta.pages >= 1
        assert outcome.extraction_meta.extraction_time_ms >= 0


class TestEN16931Extraction:
    """Extraction checks against the EN16931 sample invoice fixture."""

    def test_extract_en16931_profile(self):
        """The fixture is recognised as ZUGFeRD with profile ``EN16931``."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fixture:
            raw = fixture.read()

        outcome = extract_zugferd(raw)

        # Profile detection and XML payload.
        assert outcome.is_zugferd is True
        assert outcome.zugferd_profil == "EN16931"
        assert outcome.xml_raw is not None
        assert len(outcome.xml_raw) > 0
        assert outcome.xml_data is not None
        assert outcome.pdf_text is not None

        # Metadata, including the name of the embedded XML attachment.
        assert outcome.extraction_meta.xml_attachment_name is not None
        assert outcome.extraction_meta.pages >= 1
        assert outcome.extraction_meta.extraction_time_ms >= 0

    def test_extract_all_required_fields(self):
        """Header, party, line-item and totals fields are all populated."""
        from src.extractor import extract_zugferd

        def filled(value):
            # True when the value is present and has non-zero length.
            return value is not None and len(value) > 0

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fixture:
            raw = fixture.read()

        outcome = extract_zugferd(raw)

        assert outcome.xml_data is not None
        invoice = outcome.xml_data

        # Top-level required fields.
        assert filled(invoice.invoice_number)
        assert filled(invoice.invoice_date)
        assert invoice.supplier is not None
        assert invoice.buyer is not None
        assert invoice.line_items is not None
        assert invoice.totals is not None

        # Both parties must carry a name.
        assert filled(invoice.supplier.name)
        assert filled(invoice.buyer.name)

        # At least one line item, and the first one is fully described.
        assert len(invoice.line_items) > 0
        item = invoice.line_items[0]
        assert item.position >= 1
        assert filled(item.description)
        assert item.quantity > 0
        assert filled(item.unit)
        assert item.unit_price > 0
        assert item.line_total > 0

        # Monetary totals; only the VAT amount is allowed to be zero.
        totals = invoice.totals
        assert totals.line_total_sum > 0
        assert totals.net > 0
        assert totals.vat_total >= 0
        assert totals.gross > 0


class TestErrorHandling:
    """Error handling for unreadable or malformed input."""

    def test_corrupt_pdf_raises_error(self):
        """Non-PDF bytes raise ``ExtractionError`` with a PDF-related code."""
        from src.extractor import extract_zugferd, ExtractionError

        corrupt_pdf = b"NOT A PDF FILE AT ALL"

        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(corrupt_pdf)

        # Either code is accepted for unreadable input.
        assert exc_info.value.error_code in ["corrupt_pdf", "invalid_pdf"]

    def test_empty_pdf_raises_error(self):
        """Empty input raises ``ExtractionError`` (any error code)."""
        from src.extractor import extract_zugferd, ExtractionError

        with pytest.raises(ExtractionError):
            extract_zugferd(b"")

    def test_invalid_base64(self):
        """Invalid base64 is rejected by the decoder before any extraction.

        Base64 decoding happens in the API layer, so a payload with
        characters outside the base64 alphabet never reaches
        ``extract_zugferd()``. Strict decoding (``validate=True``) must raise
        ``binascii.Error``. The failure is asserted explicitly so that the
        test cannot pass when no error occurs.
        """
        import binascii

        invalid_base64 = b"$$$INVALID$$$"

        with pytest.raises(binascii.Error):
            base64.b64decode(invalid_base64, validate=True)


class TestPDFTextExtraction:
    """Checks for the plain-text layer returned by the extractor."""

    def test_pdf_text_extraction(self):
        """The EN16931 fixture yields a non-empty ``pdf_text``.

        Only presence and non-zero length are verified. No specific invoice
        wording is asserted because the expected terms of the fixture are not
        pinned down in this test.
        """
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f:
            pdf_bytes = f.read()

        result = extract_zugferd(pdf_bytes)

        assert result.pdf_text is not None
        assert len(result.pdf_text) > 0


class TestExtractionMeta:
    """Checks for the ``extraction_meta`` part of the result."""

    def test_extraction_meta_populated(self):
        """A ZUGFeRD fixture reports page count and timing."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fixture:
            raw = fixture.read()

        outcome = extract_zugferd(raw)

        assert outcome.extraction_meta is not None
        meta = outcome.extraction_meta
        assert meta.pages >= 1
        assert meta.extraction_time_ms >= 0

    def test_extraction_meta_non_zugferd(self):
        """A plain PDF reports page count and timing but no attachment name."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as fixture:
            raw = fixture.read()

        outcome = extract_zugferd(raw)

        assert outcome.extraction_meta is not None
        meta = outcome.extraction_meta
        assert meta.pages >= 1
        assert meta.extraction_time_ms >= 0
        # Without an embedded XML there is no attachment to name.
        assert meta.xml_attachment_name is None


class TestExtendedProfile:
    """Extraction check against the EXTENDED profile sample fixture.

    The fixture is opened unconditionally; if the file is missing the test
    fails when ``open()`` raises rather than being skipped.
    """

    def test_extract_extended_profile(self):
        """The fixture is recognised as ZUGFeRD with profile ``EXTENDED``."""
        from src.extractor import extract_zugferd

        fixture_path = "tests/fixtures/zugferd_2p1_EXTENDED_PDFA-3A.pdf"
        with open(fixture_path, "rb") as fixture:
            raw = fixture.read()

        outcome = extract_zugferd(raw)

        assert outcome.is_zugferd is True
        assert outcome.zugferd_profil == "EXTENDED"
        assert outcome.xml_data is not None


class TestZUGFeRDProfileVariations:
    """Profile detection checks for further ZUGFeRD sample files."""

    def test_detect_basicwl_profile(self):
        """The BASIC WL fixture is recognised as ZUGFeRD with some profile."""
        from src.extractor import extract_zugferd

        fixture_path = "tests/fixtures/validAvoir_FR_type380_BASICWL.pdf"
        with open(fixture_path, "rb") as fixture:
            raw = fixture.read()

        outcome = extract_zugferd(raw)

        assert outcome.is_zugferd is True
        # The exact profile label is not pinned; any detected value passes.
        assert outcome.zugferd_profil is not None
        assert outcome.xml_data is not None