Files
zugferd-service/tests/test_pdf_parser.py
m3tm3re c1f603cd46 feat(core): implement extractor, pdf_parser, and utils with TDD
Wave 2 tasks complete:
- Task 4: ZUGFeRD extractor with profile detection (factur-x)
- Task 5: PDF text parser with regex patterns
- Task 6: Utils with unit code mapping and tolerance checks

Features:
- extract_zugferd() extracts XML and text from PDFs
- parse_zugferd_xml() parses UN/CEFACT CII XML to models
- extract_from_text() extracts values using regex patterns
- translate_unit_code() maps UN/ECE codes to German
- amounts_match() checks with 0.01 EUR tolerance
- German number/date format handling

Tests: 27 utils tests, 27 pdf_parser tests, extractor tests
2026-02-04 19:42:32 +01:00

309 lines
9.5 KiB
Python

"""
Unit tests for PDF text extraction and parsing.
TDD approach: Tests written first, implementation follows.
"""
from pathlib import Path

import pytest

from src.pdf_parser import extract_text_from_pdf, extract_from_text
class TestExtractTextFromPDF:
    """Test raw text extraction from PDF bytes via extract_text_from_pdf()."""

    # Fixtures are resolved relative to this test module instead of the
    # current working directory, so the tests do not depend on where pytest
    # is launched from.
    FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"

    def test_extract_text_from_sample_pdf(self):
        """Extract text from the EN16931_Einfach.pdf sample invoice."""
        pdf_bytes = (self.FIXTURES_DIR / "EN16931_Einfach.pdf").read_bytes()

        text = extract_text_from_pdf(pdf_bytes)

        # Some text must come back at all ...
        assert text is not None
        assert len(text) > 0
        # ... and it must contain content known to be printed on the sample.
        assert "Lieferant GmbH" in text
        assert "Rechnung" in text

    def test_extract_text_from_empty_pdf(self):
        """Handle a PDF without content gracefully (no exception, str result)."""
        pdf_bytes = (self.FIXTURES_DIR / "EmptyPDFA1.pdf").read_bytes()

        text = extract_text_from_pdf(pdf_bytes)

        # Either an empty string or minimal content is acceptable.
        assert isinstance(text, str)

    def test_extract_text_from_invalid_pdf(self):
        """Reject bytes that are not a PDF by raising an exception."""
        invalid_pdf = b"Not a valid PDF"

        # NOTE(review): the concrete exception type raised by
        # extract_text_from_pdf() is not visible from this module, so the
        # check stays broad; narrow it once the type is confirmed.
        with pytest.raises(Exception):
            extract_text_from_pdf(invalid_pdf)
class TestExtractFromText:
    """Test invoice field extraction from plain text via extract_from_text()."""

    # Multi-line invoice snippet shared by the tests that need every field.
    SAMPLE_INVOICE_TEXT = (
        "\n"
        "Rechnungs-Nr: RE-2025-001234\n"
        "Rechnungsdatum: 04.02.2025\n"
        "Lieferant: Lieferant GmbH\n"
        "Netto: 100,00 EUR\n"
        "MwSt: 18,88 EUR\n"
        "Brutto: 118,88 EUR\n"
    )

    @staticmethod
    def _extracted_values(result):
        """Return the non-confidence entries of *result* that hold a value."""
        return {
            key: value
            for key, value in result.items()
            if value is not None and not key.endswith("_confidence")
        }

    def test_extract_invoice_number_german(self):
        """Extract German invoice number format."""
        text = "Rechnungs-Nr: RE-2025-001234"
        result = extract_from_text(text)
        assert "invoice_number" in result
        assert result["invoice_number"] == "RE-2025-001234"
        assert "invoice_number_confidence" in result
        assert result["invoice_number_confidence"] > 0.8

    def test_extract_invoice_number_english(self):
        """Extract English invoice number format."""
        text = "Invoice No: INV-2025-001234"
        result = extract_from_text(text)
        assert "invoice_number" in result
        assert result["invoice_number"] == "INV-2025-001234"

    def test_extract_invoice_number_beleg(self):
        """Extract Beleg-Nr format."""
        text = "Beleg-Nr: 471102"
        result = extract_from_text(text)
        assert "invoice_number" in result
        assert result["invoice_number"] == "471102"

    def test_extract_invoice_date_german(self):
        """Extract German date format and convert to ISO."""
        text = "Rechnungsdatum: 04.02.2025"
        result = extract_from_text(text)
        assert "invoice_date" in result
        assert result["invoice_date"] == "2025-02-04"

    def test_extract_invoice_date_iso(self):
        """Extract ISO date format."""
        text = "Invoice Date: 2025-02-04"
        result = extract_from_text(text)
        assert "invoice_date" in result
        assert result["invoice_date"] == "2025-02-04"

    def test_extract_gross_amount_german(self):
        """Extract gross amount with German format."""
        text = "Brutto: 1.234,56 EUR"
        result = extract_from_text(text)
        assert "gross_amount" in result
        assert result["gross_amount"] == 1234.56
        assert "gross_amount_confidence" in result

    def test_extract_gross_amount_variations(self):
        """Test various gross amount labels."""
        variations = [
            ("Brutto: 118,88", 118.88),
            ("Gesamtbetrag: 118,88 EUR", 118.88),
            ("Total: 118.88", 118.88),
            ("Endbetrag: 529,87", 529.87),
            ("Summe: 100,00", 100.00),
        ]
        for text, expected in variations:
            result = extract_from_text(text)
            # The messages identify which variation broke; without them a
            # failure inside the loop does not say which label was at fault.
            assert "gross_amount" in result, f"no gross amount found in {text!r}"
            assert result["gross_amount"] == expected, (
                f"wrong gross amount for {text!r}"
            )

    def test_extract_net_amount(self):
        """Extract net amount."""
        text = "Netto: 100,00 EUR"
        result = extract_from_text(text)
        assert "net_amount" in result
        assert result["net_amount"] == 100.00
        assert "net_amount_confidence" in result

    def test_extract_net_amount_rechnungsbetrag(self):
        """Extract net amount with alternative label."""
        text = "Rechnungsbetrag: 473,00"
        result = extract_from_text(text)
        assert "net_amount" in result
        assert result["net_amount"] == 473.00

    def test_extract_vat_amount(self):
        """Extract VAT amount."""
        text = "MwSt: 18,88 EUR"
        result = extract_from_text(text)
        assert "vat_amount" in result
        assert result["vat_amount"] == 18.88
        assert "vat_amount_confidence" in result

    def test_extract_vat_amount_variations(self):
        """Test various VAT amount labels."""
        variations = [
            ("MwSt: 56,87", 56.87),
            ("USt: 18,88 EUR", 18.88),
            ("Steuer: 19,00", 19.00),
        ]
        for text, expected in variations:
            result = extract_from_text(text)
            assert "vat_amount" in result, f"no VAT amount found in {text!r}"
            assert result["vat_amount"] == expected, (
                f"wrong VAT amount for {text!r}"
            )

    def test_extract_supplier_name(self):
        """Extract supplier name."""
        text = "Lieferant: Lieferant GmbH"
        result = extract_from_text(text)
        assert "supplier_name" in result
        assert result["supplier_name"] == "Lieferant GmbH"

    def test_extract_supplier_name_verkaeufer(self):
        """Extract supplier with Verkäufer label."""
        text = "Verkäufer: ACME Corporation Inc."
        result = extract_from_text(text)
        assert "supplier_name" in result
        assert result["supplier_name"] == "ACME Corporation Inc."

    def test_extract_all_fields_comprehensive(self):
        """Extract all fields from realistic invoice text."""
        result = extract_from_text(self.SAMPLE_INVOICE_TEXT)
        assert result["invoice_number"] == "RE-2025-001234"
        assert result["invoice_date"] == "2025-02-04"
        assert result["supplier_name"] == "Lieferant GmbH"
        assert result["net_amount"] == 100.00
        assert result["vat_amount"] == 18.88
        assert result["gross_amount"] == 118.88

    def test_confidence_scores_in_range(self):
        """Verify all confidence scores are in 0.0-1.0 range."""
        result = extract_from_text(self.SAMPLE_INVOICE_TEXT)
        confidences = {
            key: value
            for key, value in result.items()
            if key.endswith("_confidence")
        }
        # Guard against a vacuous pass: the range check below proves nothing
        # if no confidence entry was produced at all.
        assert confidences, "expected at least one *_confidence entry"
        for field, score in confidences.items():
            assert isinstance(score, (int, float)), f"{field} is not numeric"
            assert 0.0 <= score <= 1.0, f"{field} out of range: {score!r}"

    def test_empty_text(self):
        """Handle empty input text gracefully."""
        result = extract_from_text("")
        assert isinstance(result, dict)
        # Contract: an empty dict or a dict whose values are all None.
        assert not self._extracted_values(result)

    def test_no_matches(self):
        """Handle text with no matches."""
        text = "This is just random text with no invoice data."
        result = extract_from_text(text)
        assert isinstance(result, dict)
        # The text contains no digits, so no date or amount can legitimately
        # be extracted; these values must be None or missing.
        for field in ("invoice_date", "gross_amount", "net_amount", "vat_amount"):
            assert result.get(field) is None, f"unexpected value for {field}"
class TestGermanNumberFormat:
    """Check that German-formatted amounts are returned as plain floats."""

    def test_simple_decimal(self):
        """Decimal comma only: 123,45 is read as 123.45."""
        fields = extract_from_text("Brutto: 123,45")
        assert fields["gross_amount"] == 123.45

    def test_thousands_separator(self):
        """Dot as thousands separator: 1.234,56 is read as 1234.56."""
        fields = extract_from_text("Brutto: 1.234,56")
        assert fields["gross_amount"] == 1234.56

    def test_large_amount(self):
        """Five-digit amount: 10.000,00 is read as 10000.00."""
        fields = extract_from_text("Brutto: 10.000,00")
        assert fields["gross_amount"] == 10000.00

    def test_integer_amount(self):
        """Whole-number amount with zero cents: 100,00 is read as 100.00."""
        fields = extract_from_text("Netto: 100,00")
        assert fields["net_amount"] == 100.00
class TestGermanDateFormat:
    """Check that German-style dates come back as ISO (YYYY-MM-DD) strings."""

    def test_dd_mm_yyyy(self):
        """Zero-padded DD.MM.YYYY input is returned as YYYY-MM-DD."""
        fields = extract_from_text("Rechnungsdatum: 15.11.2024")
        assert fields["invoice_date"] == "2024-11-15"

    def test_d_m_yyyy(self):
        """Single-digit day and month (D.M.YYYY) are zero-padded in the result."""
        fields = extract_from_text("Rechnungsdatum: 4.2.2025")
        assert fields["invoice_date"] == "2025-02-04"
class TestRealPDFExtraction:
    """Test the text-then-fields pipeline against actual PDF fixtures."""

    # Fixtures are resolved relative to this test module instead of the
    # current working directory, so the test does not depend on where pytest
    # is launched from.
    FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"

    def test_extract_from_en16931_sample(self):
        """Extract fields from EN16931_Einfach.pdf."""
        pdf_bytes = (self.FIXTURES_DIR / "EN16931_Einfach.pdf").read_bytes()

        # Run both stages: raw text first, then field extraction on that text.
        text = extract_text_from_pdf(pdf_bytes)
        result = extract_from_text(text)

        assert result is not None
        # Exact values may vary with the PDF layout, so only require that at
        # least one real (non-confidence, non-None) field was found.
        extracted_fields = [
            key
            for key, value in result.items()
            if value is not None and not key.endswith("_confidence")
        ]
        assert len(extracted_fields) > 0, "no invoice fields were extracted"