Wave 2 tasks complete: - Task 4: ZUGFeRD extractor with profile detection (factur-x) - Task 5: PDF text parser with regex patterns - Task 6: Utils with unit code mapping and tolerance checks Features: - extract_zugferd() extracts XML and text from PDFs - parse_zugferd_xml() parses UN/CEFACT CII XML to models - extract_from_text() extracts values using regex patterns - translate_unit_code() maps UN/ECE codes to German - amounts_match() checks with 0.01 EUR tolerance - German number/date format handling Tests: 27 utils tests, 27 pdf_parser tests, extractor tests
304 lines
10 KiB
Python
304 lines
10 KiB
Python
"""Tests for ZUGFeRD extractor.

Tests are written following TDD: FAILING TESTS FIRST (RED phase),
then implementation makes them pass (GREEN phase).
"""
|
|
|
|
import pytest
|
|
import base64
|
|
|
|
|
|
class TestExtractionError:
    """Checks for the ExtractionError exception class from src.extractor."""

    def test_extraction_error_initialization(self):
        """Every keyword passed to the constructor is readable as an attribute."""
        from src.extractor import ExtractionError

        fields = {
            "error_code": "corrupt_pdf",
            "message": "PDF is corrupted",
            "details": "Trailer not found",
        }
        exc = ExtractionError(**fields)

        # Each constructor keyword must come back unchanged under the same name.
        for attr_name, expected in fields.items():
            assert getattr(exc, attr_name) == expected

    def test_extraction_error_without_details(self):
        """Omitting ``details`` leaves it as the empty string."""
        from src.extractor import ExtractionError

        exc = ExtractionError(error_code="invalid_pdf", message="Not a PDF file")

        assert exc.error_code == "invalid_pdf"
        assert exc.message == "Not a PDF file"
        assert exc.details == ""

    def test_extraction_error_is_exception(self):
        """The class is an Exception and str() yields the message."""
        from src.extractor import ExtractionError

        exc = ExtractionError(error_code="file_too_large", message="File too large")

        assert isinstance(exc, Exception)
        assert str(exc) == "File too large"
|
|
|
|
|
|
class TestFileSizeValidation:
    """Test file size validation in extract_zugferd()."""

    def test_file_size_limit_exactly_10mb(self):
        """Test a payload of exactly 10 MiB is rejected with file_too_large.

        NOTE(review): this test pins the boundary as exclusive (exactly
        10 MiB is already too large). Confirm that this is the intended
        limit semantics of extract_zugferd().
        """
        from src.extractor import extract_zugferd, ExtractionError

        # 10MB = 10 * 1024 * 1024 bytes
        large_pdf = b"X" * (10 * 1024 * 1024)

        # Should raise file_too_large error
        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(large_pdf)

        assert exc_info.value.error_code == "file_too_large"

    def test_file_size_limit_10mb_plus_one_byte(self):
        """Test PDF one byte over 10MB limit is rejected."""
        from src.extractor import extract_zugferd, ExtractionError

        # 10MB + 1 byte
        too_large = b"X" * (10 * 1024 * 1024 + 1)

        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(too_large)

        assert exc_info.value.error_code == "file_too_large"

    def test_file_size_under_10mb_accepted(self):
        """Test PDF under 10MB is not rejected for its size.

        The payload is not a real PDF, so an ExtractionError with some
        other error code is tolerated; only file_too_large fails the test.
        A call that raises nothing also passes.
        """
        from src.extractor import extract_zugferd, ExtractionError

        # Small PDF (9MB)
        small_pdf = b"X" * (9 * 1024 * 1024)

        # Should process (even if invalid PDF, different error)
        try:
            extract_zugferd(small_pdf)
        except ExtractionError as e:
            # Different error is expected (not file_too_large)
            assert e.error_code != "file_too_large"
|
|
|
|
|
|
class TestNonZUGFeRDPDF:
    """Extraction behaviour for a PDF that carries no ZUGFeRD XML."""

    def test_non_zugferd_pdf(self):
        """A plain PDF yields is_zugferd=False, no XML fields, but text and meta."""
        from src.extractor import extract_zugferd

        # Fixture without an embedded ZUGFeRD attachment.
        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as handle:
            payload = handle.read()

        outcome = extract_zugferd(payload)

        # Nothing ZUGFeRD-related may be reported.
        assert outcome.is_zugferd is False
        assert outcome.zugferd_profil is None
        assert outcome.xml_raw is None
        assert outcome.xml_data is None

        # Plain text must still be extracted.
        assert outcome.pdf_text is not None
        assert len(outcome.pdf_text) > 0

        # Metadata is filled in regardless of ZUGFeRD content.
        meta = outcome.extraction_meta
        assert meta.pages >= 1
        assert meta.extraction_time_ms >= 0
|
|
|
|
|
|
class TestEN16931Extraction:
    """Extraction checks against the EN16931 sample invoice fixture."""

    def test_extract_en16931_profile(self):
        """The EN16931 fixture is recognised and all result parts are present."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as handle:
            payload = handle.read()

        outcome = extract_zugferd(payload)

        # Profile detection.
        assert outcome.is_zugferd is True
        assert outcome.zugferd_profil == "EN16931"

        # Raw XML, parsed XML and page text are all returned.
        assert outcome.xml_raw is not None
        assert len(outcome.xml_raw) > 0
        assert outcome.xml_data is not None
        assert outcome.pdf_text is not None

        # Extraction metadata.
        meta = outcome.extraction_meta
        assert meta.xml_attachment_name is not None
        assert meta.pages >= 1
        assert meta.extraction_time_ms >= 0

    def test_extract_all_required_fields(self):
        """The parsed XML data exposes non-empty header, party, line and total fields."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as handle:
            payload = handle.read()

        outcome = extract_zugferd(payload)

        assert outcome.xml_data is not None
        parsed = outcome.xml_data

        # Required fields
        assert parsed.invoice_number is not None
        assert len(parsed.invoice_number) > 0
        assert parsed.invoice_date is not None
        assert len(parsed.invoice_date) > 0
        assert parsed.supplier is not None
        assert parsed.buyer is not None
        assert parsed.line_items is not None
        assert parsed.totals is not None

        # Supplier fields
        supplier_name = parsed.supplier.name
        assert supplier_name is not None
        assert len(supplier_name) > 0

        # Buyer fields
        buyer_name = parsed.buyer.name
        assert buyer_name is not None
        assert len(buyer_name) > 0

        # Line items
        assert len(parsed.line_items) > 0
        leading = parsed.line_items[0]
        assert leading.position >= 1
        assert leading.description is not None
        assert len(leading.description) > 0
        assert leading.quantity > 0
        assert leading.unit is not None
        assert len(leading.unit) > 0
        assert leading.unit_price > 0
        assert leading.line_total > 0

        # Totals
        sums = parsed.totals
        assert sums.line_total_sum > 0
        assert sums.net > 0
        assert sums.vat_total >= 0
        assert sums.gross > 0
|
|
|
|
|
|
class TestErrorHandling:
    """Test error handling for various PDF issues."""

    def test_corrupt_pdf_raises_error(self):
        """Test corrupt PDF raises ExtractionError with correct code."""
        from src.extractor import extract_zugferd, ExtractionError

        # Invalid PDF data
        corrupt_pdf = b"NOT A PDF FILE AT ALL"

        with pytest.raises(ExtractionError) as exc_info:
            extract_zugferd(corrupt_pdf)

        # Should raise either corrupt_pdf or invalid_pdf
        assert exc_info.value.error_code in ["corrupt_pdf", "invalid_pdf"]

    def test_empty_pdf_raises_error(self):
        """Test empty PDF raises ExtractionError."""
        from src.extractor import extract_zugferd, ExtractionError

        with pytest.raises(ExtractionError):
            extract_zugferd(b"")

    def test_invalid_base64(self):
        """Test malformed base64 is rejected by strict decoding.

        This exercises only the decode step that precedes extraction
        (presumably performed by the API layer - verify against the
        caller). With ``validate=True`` the characters outside the base64
        alphabet make ``b64decode`` raise, so extract_zugferd() would never
        receive this payload.
        """
        # Local import, matching this file's function-scope import style.
        import binascii

        invalid_base64 = b"$$$INVALID$$$"

        # Assert the failure explicitly instead of swallowing it, so the
        # test fails if strict decoding ever stops rejecting this input.
        with pytest.raises(binascii.Error):
            base64.b64decode(invalid_base64, validate=True)
|
|
|
|
|
|
class TestPDFTextExtraction:
    """Test PDF text extraction."""

    def test_pdf_text_extraction(self):
        """Test non-empty text is extracted from the EN16931 fixture."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f:
            pdf_bytes = f.read()

        result = extract_zugferd(pdf_bytes)

        assert result.pdf_text is not None
        assert len(result.pdf_text) > 0
        # NOTE(review): no specific invoice terms (German or English) are
        # asserted here; only the presence of some text is checked.
|
|
|
|
|
|
class TestExtractionMeta:
    """Checks on the extraction_meta object attached to every result."""

    def test_extraction_meta_populated(self):
        """For the EN16931 fixture, page count and timing are reported."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as handle:
            payload = handle.read()

        outcome = extract_zugferd(payload)

        assert outcome.extraction_meta is not None
        meta = outcome.extraction_meta
        assert meta.pages >= 1
        assert meta.extraction_time_ms >= 0

    def test_extraction_meta_non_zugferd(self):
        """For a PDF without ZUGFeRD XML, meta is present but names no attachment."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as handle:
            payload = handle.read()

        outcome = extract_zugferd(payload)

        assert outcome.extraction_meta is not None
        meta = outcome.extraction_meta
        assert meta.pages >= 1
        assert meta.extraction_time_ms >= 0
        # No embedded XML, hence no attachment name.
        assert meta.xml_attachment_name is None
|
|
|
|
|
|
class TestExtendedProfile:
    """Extraction checks against the EXTENDED profile sample fixture.

    The fixture file is opened unconditionally, so it must be present.
    """

    def test_extract_extended_profile(self):
        """The EXTENDED fixture is recognised as ZUGFeRD with profile EXTENDED."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/zugferd_2p1_EXTENDED_PDFA-3A.pdf", "rb") as handle:
            payload = handle.read()

        outcome = extract_zugferd(payload)

        assert outcome.is_zugferd is True
        assert outcome.zugferd_profil == "EXTENDED"
        assert outcome.xml_data is not None
|
|
|
|
|
|
class TestZUGFeRDProfileVariations:
    """Profile detection checks for further ZUGFeRD sample files."""

    def test_detect_basicwl_profile(self):
        """The BASIC WL fixture is recognised as ZUGFeRD with some profile set."""
        from src.extractor import extract_zugferd

        with open("tests/fixtures/validAvoir_FR_type380_BASICWL.pdf", "rb") as handle:
            payload = handle.read()

        outcome = extract_zugferd(payload)

        assert outcome.is_zugferd is True
        # Only the presence of a profile is checked, not its exact name
        # (BASIC, BASICWL, etc.).
        assert outcome.zugferd_profil is not None
        assert outcome.xml_data is not None
|