Files
zugferd-service/tests/test_extractor.py
m3tm3re c1f603cd46 feat(core): implement extractor, pdf_parser, and utils with TDD
Wave 2 tasks complete:
- Task 4: ZUGFeRD extractor with profile detection (factur-x)
- Task 5: PDF text parser with regex patterns
- Task 6: Utils with unit code mapping and tolerance checks

Features:
- extract_zugferd() extracts XML and text from PDFs
- parse_zugferd_xml() parses UN/CEFACT CII XML to models
- extract_from_text() extracts values using regex patterns
- translate_unit_code() maps UN/ECE codes to German
- amounts_match() checks with 0.01 EUR tolerance
- German number/date format handling

Tests: 27 utils tests, 27 pdf_parser tests, extractor tests
2026-02-04 19:42:32 +01:00

304 lines
10 KiB
Python

"""Tests for ZUGFeRD extractor.
Tests are written following TDD: FAILING TESTS FIRST (RED phase),
then implementation makes them pass (GREEN phase).
"""
import pytest
import base64
class TestExtractionError:
    """Checks on how ExtractionError stores and exposes its fields."""

    def test_extraction_error_initialization(self):
        """Every keyword passed to the constructor is readable as an attribute."""
        from src.extractor import ExtractionError

        expected_fields = {
            "error_code": "corrupt_pdf",
            "message": "PDF is corrupted",
            "details": "Trailer not found",
        }
        err = ExtractionError(**expected_fields)

        # Walk the fields in declaration order so failures surface in the
        # same sequence as three separate assertions would.
        for attr_name, expected_value in expected_fields.items():
            assert getattr(err, attr_name) == expected_value

    def test_extraction_error_without_details(self):
        """Leaving out ``details`` yields an empty string for that attribute."""
        from src.extractor import ExtractionError

        err = ExtractionError(error_code="invalid_pdf", message="Not a PDF file")

        assert err.error_code == "invalid_pdf"
        assert err.message == "Not a PDF file"
        assert err.details == ""

    def test_extraction_error_is_exception(self):
        """The class derives from Exception and str() gives back the message."""
        from src.extractor import ExtractionError

        err = ExtractionError(error_code="file_too_large", message="File too large")

        assert isinstance(err, Exception)
        rendered = str(err)
        assert rendered == "File too large"
class TestFileSizeValidation:
    """Exercise the input-size check performed by extract_zugferd()."""

    def test_file_size_limit_exactly_10mb(self):
        """A payload of exactly 10 * 1024 * 1024 bytes is rejected.

        The assertion below expects ``file_too_large`` for this size, i.e.
        the test treats the 10 MiB boundary itself as already too large.

        NOTE(review): the method name only says "limit"; confirm that the
        extractor is really meant to reject a payload of exactly this size
        (exclusive limit) rather than accept it.
        """
        from src.extractor import ExtractionError, extract_zugferd

        # 10MB = 10 * 1024 * 1024 bytes
        large_pdf = b"X" * (10 * 1024 * 1024)

        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(large_pdf)
        assert exc_info.value.error_code == "file_too_large"

    def test_file_size_limit_10mb_plus_one_byte(self):
        """A payload one byte above 10 * 1024 * 1024 bytes is rejected."""
        from src.extractor import ExtractionError, extract_zugferd

        # 10MB + 1 byte
        too_large = b"X" * (10 * 1024 * 1024 + 1)

        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(too_large)
        assert exc_info.value.error_code == "file_too_large"

    def test_file_size_under_10mb_accepted(self):
        """A 9 * 1024 * 1024 byte payload is not rejected for its size.

        The bytes are not a real PDF, so extraction may still raise an
        ExtractionError; the test only requires that any such error carries
        a code other than ``file_too_large``. No exception at all also
        counts as a pass.
        """
        from src.extractor import ExtractionError, extract_zugferd

        small_pdf = b"X" * (9 * 1024 * 1024)

        try:
            extract_zugferd(small_pdf)
        except ExtractionError as e:
            # A different failure is fine; only the size rejection is wrong here.
            assert e.error_code != "file_too_large"
class TestNonZUGFeRDPDF:
    """Extraction behaviour for a PDF that carries no ZUGFeRD XML."""

    def test_non_zugferd_pdf(self):
        """A plain PDF yields is_zugferd=False, no XML fields, but text and metadata."""
        from src.extractor import extract_zugferd

        # Fixture without an embedded invoice XML.
        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as fixture:
            payload = fixture.read()

        outcome = extract_zugferd(payload)

        assert outcome.is_zugferd is False
        # All XML-related fields must be absent.
        for xml_field in ("zugferd_profil", "xml_raw", "xml_data"):
            assert getattr(outcome, xml_field) is None

        text = outcome.pdf_text
        assert text is not None
        assert len(text) > 0

        assert outcome.extraction_meta.pages >= 1
        assert outcome.extraction_meta.extraction_time_ms >= 0
class TestEN16931Extraction:
    """Extraction checks against the EN16931 sample invoice fixture."""

    @staticmethod
    def _is_non_empty(value):
        """Return True when *value* is not None and has a length above zero."""
        return value is not None and len(value) > 0

    def test_extract_en16931_profile(self):
        """The EN16931 fixture is recognised as ZUGFeRD with profile "EN16931"."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fixture:
            payload = fixture.read()

        outcome = extract_zugferd(payload)

        assert outcome.is_zugferd is True
        assert outcome.zugferd_profil == "EN16931"

        raw_xml = outcome.xml_raw
        assert raw_xml is not None
        assert len(raw_xml) > 0

        assert outcome.xml_data is not None
        assert outcome.pdf_text is not None

        assert outcome.extraction_meta.xml_attachment_name is not None
        assert outcome.extraction_meta.pages >= 1
        assert outcome.extraction_meta.extraction_time_ms >= 0

    def test_extract_all_required_fields(self):
        """Header, party, first line item and totals fields are all populated."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fixture:
            payload = fixture.read()

        outcome = extract_zugferd(payload)
        assert outcome.xml_data is not None
        invoice = outcome.xml_data

        # Invoice header and top-level containers.
        assert self._is_non_empty(invoice.invoice_number)
        assert self._is_non_empty(invoice.invoice_date)
        for container in ("supplier", "buyer", "line_items", "totals"):
            assert getattr(invoice, container) is not None

        # Both parties need a non-empty name.
        assert self._is_non_empty(invoice.supplier.name)
        assert self._is_non_empty(invoice.buyer.name)

        # At least one line item, and the first one must be fully filled in.
        assert len(invoice.line_items) > 0
        item = invoice.line_items[0]
        assert item.position >= 1
        assert self._is_non_empty(item.description)
        assert item.quantity > 0
        assert self._is_non_empty(item.unit)
        assert item.unit_price > 0
        assert item.line_total > 0

        # Monetary totals: VAT may be zero, the others must be positive.
        assert invoice.totals.line_total_sum > 0
        assert invoice.totals.net > 0
        assert invoice.totals.vat_total >= 0
        assert invoice.totals.gross > 0
class TestErrorHandling:
    """Error handling of extract_zugferd() for unusable input bytes."""

    def test_corrupt_pdf_raises_error(self):
        """Bytes that are not a PDF raise ExtractionError with a PDF error code."""
        from src.extractor import ExtractionError, extract_zugferd

        corrupt_pdf = b"NOT A PDF FILE AT ALL"

        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(corrupt_pdf)
        # Either classification is acceptable for garbage input.
        assert exc_info.value.error_code in ["corrupt_pdf", "invalid_pdf"]

    def test_empty_pdf_raises_error(self):
        """An empty byte string raises ExtractionError (any error code)."""
        from src.extractor import ExtractionError, extract_zugferd

        with pytest.raises(ExtractionError):
            extract_zugferd(b"")

    def test_invalid_base64(self):
        """Invalid base64 input ends in a decode error or an ExtractionError.

        Decoding is expected to happen in the API layer; this test only
        mirrors that flow. With ``validate=True`` the ``$`` characters are
        outside the base64 alphabet, so the decode step itself raises and
        ``extract_zugferd`` is not reached for this input.
        """
        from src.extractor import ExtractionError, extract_zugferd

        invalid_base64 = b"$$$INVALID$$$"
        try:
            decoded = base64.b64decode(invalid_base64, validate=True)
            extract_zugferd(decoded)
        except ValueError:
            # binascii.Error, raised by the strict decode, is a ValueError
            # subclass, so this single clause covers both without reaching
            # into base64's private ``binascii`` attribute.
            pass
        except ExtractionError as e:
            # Only reachable if decoding succeeded and the extractor refused
            # the resulting bytes.
            assert e.error_code in ["invalid_pdf", "corrupt_pdf"]
class TestPDFTextExtraction:
    """Text extraction from the visible PDF layer."""

    def test_pdf_text_extraction(self):
        """The EN16931 fixture yields a non-empty text string.

        No assertion is made about specific invoice terms: the wording of
        the fixture is not pinned down here, so only type and non-emptiness
        are checked.
        """
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f:
            pdf_bytes = f.read()

        result = extract_zugferd(pdf_bytes)

        assert result.pdf_text is not None
        # Explicit type check instead of an unused ``.lower()`` result.
        assert isinstance(result.pdf_text, str)
        assert len(result.pdf_text) > 0
class TestExtractionMeta:
    """Checks on the extraction_meta part of the extraction result."""

    def test_extraction_meta_populated(self):
        """A ZUGFeRD fixture reports at least one page and a non-negative duration."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as fixture:
            payload = fixture.read()

        outcome = extract_zugferd(payload)

        assert outcome.extraction_meta is not None
        assert outcome.extraction_meta.pages >= 1
        assert outcome.extraction_meta.extraction_time_ms >= 0

    def test_extraction_meta_non_zugferd(self):
        """A plain PDF reports page count and duration but no attachment name."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as fixture:
            payload = fixture.read()

        outcome = extract_zugferd(payload)

        assert outcome.extraction_meta is not None
        assert outcome.extraction_meta.pages >= 1
        assert outcome.extraction_meta.extraction_time_ms >= 0
        # Without embedded XML there is no attachment to name.
        assert outcome.extraction_meta.xml_attachment_name is None
class TestExtendedProfile:
    """Extraction check against the EXTENDED profile sample fixture."""

    def test_extract_extended_profile(self):
        """The EXTENDED fixture is recognised as ZUGFeRD with profile "EXTENDED"."""
        from src.extractor import extract_zugferd

        fixture_path = "tests/fixtures/zugferd_2p1_EXTENDED_PDFA-3A.pdf"
        with open(fixture_path, "rb") as fixture:
            payload = fixture.read()

        outcome = extract_zugferd(payload)

        assert outcome.is_zugferd is True
        assert outcome.zugferd_profil == "EXTENDED"
        assert outcome.xml_data is not None
class TestZUGFeRDProfileVariations:
    """Profile detection checks for further ZUGFeRD sample fixtures."""

    def test_detect_basicwl_profile(self):
        """The BASIC WL fixture is recognised as ZUGFeRD with some profile set."""
        from src.extractor import extract_zugferd

        fixture_path = "tests/fixtures/validAvoir_FR_type380_BASICWL.pdf"
        with open(fixture_path, "rb") as fixture:
            payload = fixture.read()

        outcome = extract_zugferd(payload)

        assert outcome.is_zugferd is True
        # The exact label (BASIC, BASICWL, ...) is deliberately not pinned;
        # only that a profile was detected at all.
        assert outcome.zugferd_profil is not None
        assert outcome.xml_data is not None