"""Tests for ZUGFeRD extractor. Tests are written following TDD: FAILING TESTS FIRST (RED phase), then implementation makes them pass (GREEN phase). """ import pytest import base64 class TestExtractionError: """Test ExtractionError exception class.""" def test_extraction_error_initialization(self): """Test ExtractionError can be created with all fields.""" from src.extractor import ExtractionError error = ExtractionError( error_code="corrupt_pdf", message="PDF is corrupted", details="Trailer not found", ) assert error.error_code == "corrupt_pdf" assert error.message == "PDF is corrupted" assert error.details == "Trailer not found" def test_extraction_error_without_details(self): """Test ExtractionError can be created without details.""" from src.extractor import ExtractionError error = ExtractionError(error_code="invalid_pdf", message="Not a PDF file") assert error.error_code == "invalid_pdf" assert error.message == "Not a PDF file" assert error.details == "" def test_extraction_error_is_exception(self): """Test ExtractionError is a proper exception.""" from src.extractor import ExtractionError error = ExtractionError(error_code="file_too_large", message="File too large") assert isinstance(error, Exception) assert str(error) == "File too large" class TestFileSizeValidation: """Test file size validation in extract_zugferd().""" def test_file_size_limit_exactly_10mb(self): """Test PDF exactly at 10MB limit is accepted.""" from src.extractor import extract_zugferd, ExtractionError """Test PDF exactly at 10MB limit is accepted.""" from src.extractor import extract_zugferd # 10MB = 10 * 1024 * 1024 bytes large_pdf = b"X" * (10 * 1024 * 1024) # Should raise file_too_large error with pytest.raises(ExtractionError) as exc_info: extract_zugferd(large_pdf) assert exc_info.value.error_code == "file_too_large" def test_file_size_limit_10mb_plus_one_byte(self): """Test PDF one byte over 10MB limit is rejected.""" from src.extractor import extract_zugferd, ExtractionError # 10MB + 1 byte too_large = b"X" * (10 * 1024 * 1024 + 1) with pytest.raises(ExtractionError) as exc_info: extract_zugferd(too_large) assert exc_info.value.error_code == "file_too_large" def test_file_size_under_10mb_accepted(self): """Test PDF under 10MB is accepted for processing.""" from src.extractor import extract_zugferd, ExtractionError # Small PDF (9MB) small_pdf = b"X" * (9 * 1024 * 1024) # Should process (even if invalid PDF, different error) try: extract_zugferd(small_pdf) except ExtractionError as e: # Different error is expected (not file_too_large) assert e.error_code != "file_too_large" class TestNonZUGFeRDPDF: """Test extraction from PDF without ZUGFeRD XML.""" def test_non_zugferd_pdf(self): """Test PDF without ZUGFeRD XML returns is_zugferd=False.""" from src.extractor import extract_zugferd # Load non-ZUGFeRD sample PDF with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as f: pdf_bytes = f.read() result = extract_zugferd(pdf_bytes) assert result.is_zugferd is False assert result.zugferd_profil is None assert result.xml_raw is None assert result.xml_data is None assert result.pdf_text is not None assert len(result.pdf_text) > 0 assert result.extraction_meta.pages >= 1 assert result.extraction_meta.extraction_time_ms >= 0 class TestEN16931Extraction: """Test extraction from EN16931 profile PDF.""" def test_extract_en16931_profile(self): """Test EN16931 PDF extraction detects correct profile.""" from src.extractor import extract_zugferd with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f: pdf_bytes = f.read() result = extract_zugferd(pdf_bytes) assert result.is_zugferd is True assert result.zugferd_profil == "EN16931" assert result.xml_raw is not None assert len(result.xml_raw) > 0 assert result.xml_data is not None assert result.pdf_text is not None assert result.extraction_meta.xml_attachment_name is not None assert result.extraction_meta.pages >= 1 assert result.extraction_meta.extraction_time_ms >= 0 def test_extract_all_required_fields(self): """Test all XmlData fields are populated from EN16931.""" from src.extractor import extract_zugferd with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f: pdf_bytes = f.read() result = extract_zugferd(pdf_bytes) assert result.xml_data is not None xml_data = result.xml_data # Required fields assert xml_data.invoice_number is not None and len(xml_data.invoice_number) > 0 assert xml_data.invoice_date is not None and len(xml_data.invoice_date) > 0 assert xml_data.supplier is not None assert xml_data.buyer is not None assert xml_data.line_items is not None assert xml_data.totals is not None # Supplier fields assert xml_data.supplier.name is not None and len(xml_data.supplier.name) > 0 # Buyer fields assert xml_data.buyer.name is not None and len(xml_data.buyer.name) > 0 # Line items assert len(xml_data.line_items) > 0 first_item = xml_data.line_items[0] assert first_item.position >= 1 assert first_item.description is not None and len(first_item.description) > 0 assert first_item.quantity > 0 assert first_item.unit is not None and len(first_item.unit) > 0 assert first_item.unit_price > 0 assert first_item.line_total > 0 # Totals assert xml_data.totals.line_total_sum > 0 assert xml_data.totals.net > 0 assert xml_data.totals.vat_total >= 0 assert xml_data.totals.gross > 0 class TestErrorHandling: """Test error handling for various PDF issues.""" def test_corrupt_pdf_raises_error(self): """Test corrupt PDF raises ExtractionError with correct code.""" from src.extractor import extract_zugferd, ExtractionError # Invalid PDF data corrupt_pdf = b"NOT A PDF FILE AT ALL" with pytest.raises(ExtractionError) as exc_info: extract_zugferd(corrupt_pdf) # Should raise either corrupt_pdf or invalid_pdf assert exc_info.value.error_code in ["corrupt_pdf", "invalid_pdf"] def test_empty_pdf_raises_error(self): """Test empty PDF raises ExtractionError.""" from src.extractor import extract_zugferd, ExtractionError with pytest.raises(ExtractionError): extract_zugferd(b"") def test_invalid_base64(self): """Test invalid base64 raises ExtractionError.""" from src.extractor import extract_zugferd, ExtractionError # This would be called by API layer, but we can test the concept # Invalid PDF that's not valid base64-encoded try: invalid_base64 = b"$$$INVALID$$$" # If API layer decodes invalid base64, it gets error decoded = base64.b64decode(invalid_base64, validate=True) extract_zugferd(decoded) except (base64.binascii.Error, ValueError): # base64 error is expected pass except ExtractionError as e: # Or extraction error from invalid PDF assert e.error_code in ["invalid_pdf", "corrupt_pdf"] class TestPDFTextExtraction: """Test PDF text extraction.""" def test_pdf_text_extraction(self): """Test PDF text is extracted correctly.""" from src.extractor import extract_zugferd with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f: pdf_bytes = f.read() result = extract_zugferd(pdf_bytes) assert result.pdf_text is not None assert len(result.pdf_text) > 0 # Should contain some common German invoice terms text_lower = result.pdf_text.lower() # PDF text may contain invoice-related terms in German or English class TestExtractionMeta: """Test extraction metadata.""" def test_extraction_meta_populated(self): """Test extraction metadata is populated correctly.""" from src.extractor import extract_zugferd with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f: pdf_bytes = f.read() result = extract_zugferd(pdf_bytes) assert result.extraction_meta is not None assert result.extraction_meta.pages >= 1 assert result.extraction_meta.extraction_time_ms >= 0 def test_extraction_meta_non_zugferd(self): """Test extraction metadata for non-ZUGFeRD PDF.""" from src.extractor import extract_zugferd with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as f: pdf_bytes = f.read() result = extract_zugferd(pdf_bytes) assert result.extraction_meta is not None assert result.extraction_meta.pages >= 1 assert result.extraction_meta.extraction_time_ms >= 0 assert result.extraction_meta.xml_attachment_name is None class TestExtendedProfile: """Test extraction from EXTENDED profile PDF (if available).""" def test_extract_extended_profile(self): """Test EXTENDED PDF extraction detects correct profile.""" from src.extractor import extract_zugferd with open("tests/fixtures/zugferd_2p1_EXTENDED_PDFA-3A.pdf", "rb") as f: pdf_bytes = f.read() result = extract_zugferd(pdf_bytes) assert result.is_zugferd is True assert result.zugferd_profil == "EXTENDED" assert result.xml_data is not None class TestZUGFeRDProfileVariations: """Test various ZUGFeRD profile detection.""" def test_detect_basicwl_profile(self): """Test BASIC WL profile detection.""" from src.extractor import extract_zugferd with open("tests/fixtures/validAvoir_FR_type380_BASICWL.pdf", "rb") as f: pdf_bytes = f.read() result = extract_zugferd(pdf_bytes) assert result.is_zugferd is True # Profile should be detected (BASIC, BASICWL, etc.) assert result.zugferd_profil is not None assert result.xml_data is not None