Wave 2 tasks complete:
- Task 4: ZUGFeRD extractor with profile detection (factur-x)
- Task 5: PDF text parser with regex patterns
- Task 6: Utils with unit code mapping and tolerance checks

Features:
- extract_zugferd() extracts XML and text from PDFs
- parse_zugferd_xml() parses UN/CEFACT CII XML to models
- extract_from_text() extracts values using regex patterns
- translate_unit_code() maps UN/ECE codes to German
- amounts_match() checks with 0.01 EUR tolerance
- German number/date format handling

Tests: 27 utils tests, 27 pdf_parser tests, extractor tests
309 lines
9.5 KiB
Python
309 lines
9.5 KiB
Python
"""
|
|
Unit tests for PDF text extraction and parsing.
|
|
|
|
TDD approach: Tests written first, implementation follows.
|
|
"""
|
|
|
|
import pytest
|
|
from src.pdf_parser import extract_text_from_pdf, extract_from_text
|
|
|
|
|
|
class TestExtractTextFromPDF:
    """Checks for raw text extraction from PDF bytes (pypdf-based)."""

    def test_extract_text_from_sample_pdf(self):
        """The EN16931_Einfach.pdf fixture yields non-empty text with known content."""
        # Read the sample invoice shipped with the test fixtures.
        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as pdf_file:
            raw_pdf = pdf_file.read()

        # Run the extraction under test.
        extracted = extract_text_from_pdf(raw_pdf)

        # Something must come back at all ...
        assert extracted is not None
        assert len(extracted) > 0

        # ... and it must contain the landmarks of the sample invoice.
        for landmark in ("Lieferant GmbH", "Rechnung"):
            assert landmark in extracted

    def test_extract_text_from_empty_pdf(self):
        """A PDF without content is handled without failing."""
        with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as pdf_file:
            raw_pdf = pdf_file.read()

        extracted = extract_text_from_pdf(raw_pdf)

        # An empty string or minimal content is acceptable; only the type is pinned.
        assert isinstance(extracted, str)

    def test_extract_text_from_invalid_pdf(self):
        """Bytes that are not a PDF make the extractor raise."""
        garbage = b"Not a valid PDF"

        # Any exception counts as an appropriate error here.
        with pytest.raises(Exception):
            extract_text_from_pdf(garbage)
|
|
|
|
|
|
class TestExtractFromText:
    """Test invoice field extraction from text using regex patterns.

    Each test feeds a small text snippet to ``extract_from_text`` and checks
    the keys/values of the returned dict. Label variations are parametrized
    so that every label is reported as its own test case instead of the
    first failing label hiding the remaining ones.
    """

    def test_extract_invoice_number_german(self):
        """Extract German invoice number format."""
        text = "Rechnungs-Nr: RE-2025-001234"

        result = extract_from_text(text)

        assert "invoice_number" in result
        assert result["invoice_number"] == "RE-2025-001234"
        assert "invoice_number_confidence" in result
        assert result["invoice_number_confidence"] > 0.8

    def test_extract_invoice_number_english(self):
        """Extract English invoice number format."""
        text = "Invoice No: INV-2025-001234"

        result = extract_from_text(text)

        assert "invoice_number" in result
        assert result["invoice_number"] == "INV-2025-001234"

    def test_extract_invoice_number_beleg(self):
        """Extract Beleg-Nr format."""
        text = "Beleg-Nr: 471102"

        result = extract_from_text(text)

        assert "invoice_number" in result
        assert result["invoice_number"] == "471102"

    def test_extract_invoice_date_german(self):
        """Extract German date format and convert to ISO."""
        text = "Rechnungsdatum: 04.02.2025"

        result = extract_from_text(text)

        assert "invoice_date" in result
        assert result["invoice_date"] == "2025-02-04"

    def test_extract_invoice_date_iso(self):
        """Extract ISO date format."""
        text = "Invoice Date: 2025-02-04"

        result = extract_from_text(text)

        assert "invoice_date" in result
        assert result["invoice_date"] == "2025-02-04"

    def test_extract_gross_amount_german(self):
        """Extract gross amount with German format."""
        text = "Brutto: 1.234,56 EUR"

        result = extract_from_text(text)

        assert "gross_amount" in result
        assert result["gross_amount"] == 1234.56
        assert "gross_amount_confidence" in result

    @pytest.mark.parametrize(
        ("text", "expected"),
        [
            ("Brutto: 118,88", 118.88),
            ("Gesamtbetrag: 118,88 EUR", 118.88),
            ("Total: 118.88", 118.88),
            ("Endbetrag: 529,87", 529.87),
            ("Summe: 100,00", 100.00),
        ],
    )
    def test_extract_gross_amount_variations(self, text, expected):
        """Test various gross amount labels (one test case per label)."""
        result = extract_from_text(text)

        assert "gross_amount" in result
        assert result["gross_amount"] == expected

    def test_extract_net_amount(self):
        """Extract net amount."""
        text = "Netto: 100,00 EUR"

        result = extract_from_text(text)

        assert "net_amount" in result
        assert result["net_amount"] == 100.00
        assert "net_amount_confidence" in result

    def test_extract_net_amount_rechnungsbetrag(self):
        """Extract net amount with alternative label."""
        text = "Rechnungsbetrag: 473,00"

        result = extract_from_text(text)

        assert "net_amount" in result
        assert result["net_amount"] == 473.00

    def test_extract_vat_amount(self):
        """Extract VAT amount."""
        text = "MwSt: 18,88 EUR"

        result = extract_from_text(text)

        assert "vat_amount" in result
        assert result["vat_amount"] == 18.88
        assert "vat_amount_confidence" in result

    @pytest.mark.parametrize(
        ("text", "expected"),
        [
            ("MwSt: 56,87", 56.87),
            ("USt: 18,88 EUR", 18.88),
            ("Steuer: 19,00", 19.00),
        ],
    )
    def test_extract_vat_amount_variations(self, text, expected):
        """Test various VAT amount labels (one test case per label)."""
        result = extract_from_text(text)

        assert "vat_amount" in result
        assert result["vat_amount"] == expected

    def test_extract_supplier_name(self):
        """Extract supplier name."""
        text = "Lieferant: Lieferant GmbH"

        result = extract_from_text(text)

        assert "supplier_name" in result
        assert result["supplier_name"] == "Lieferant GmbH"

    def test_extract_supplier_name_verkaeufer(self):
        """Extract supplier with Verkäufer label."""
        text = "Verkäufer: ACME Corporation Inc."

        result = extract_from_text(text)

        assert "supplier_name" in result
        assert result["supplier_name"] == "ACME Corporation Inc."

    def test_extract_all_fields_comprehensive(self):
        """Extract all fields from realistic invoice text."""
        text = """
        Rechnungs-Nr: RE-2025-001234
        Rechnungsdatum: 04.02.2025
        Lieferant: Lieferant GmbH
        Netto: 100,00 EUR
        MwSt: 18,88 EUR
        Brutto: 118,88 EUR
        """

        result = extract_from_text(text)

        assert result["invoice_number"] == "RE-2025-001234"
        assert result["invoice_date"] == "2025-02-04"
        assert result["supplier_name"] == "Lieferant GmbH"
        assert result["net_amount"] == 100.00
        assert result["vat_amount"] == 18.88
        assert result["gross_amount"] == 118.88

    def test_confidence_scores_in_range(self):
        """Verify all confidence scores are in 0.0-1.0 range."""
        text = """
        Rechnungs-Nr: RE-2025-001234
        Rechnungsdatum: 04.02.2025
        Lieferant: Lieferant GmbH
        Netto: 100,00 EUR
        MwSt: 18,88 EUR
        Brutto: 118,88 EUR
        """

        result = extract_from_text(text)

        confidence_fields = [k for k in result if k.endswith("_confidence")]

        # Guard against a vacuous pass: without this, the loop below checks
        # nothing when no *_confidence key is returned at all.
        assert confidence_fields, "no *_confidence fields were returned"

        for field in confidence_fields:
            assert isinstance(result[field], (int, float))
            assert 0.0 <= result[field] <= 1.0

    def test_empty_text(self):
        """Handle empty input text gracefully."""
        result = extract_from_text("")

        # Should return empty dict or dict with None values
        assert isinstance(result, dict)

    def test_no_matches(self):
        """Handle text with no matches."""
        text = "This is just random text with no invoice data."

        result = extract_from_text(text)

        assert isinstance(result, dict)
        # Values should be None or missing
|
|
|
|
|
|
class TestGermanNumberFormat:
    """German-formatted amounts (comma decimal, dot thousands) become floats."""

    def test_simple_decimal(self):
        """Plain comma decimal: 123,45 -> 123.45."""
        parsed = extract_from_text("Brutto: 123,45")
        assert parsed["gross_amount"] == 123.45

    def test_thousands_separator(self):
        """Dot as thousands separator: 1.234,56 -> 1234.56."""
        parsed = extract_from_text("Brutto: 1.234,56")
        assert parsed["gross_amount"] == 1234.56

    def test_large_amount(self):
        """Five-digit amount: 10.000,00 -> 10000.00."""
        parsed = extract_from_text("Brutto: 10.000,00")
        assert parsed["gross_amount"] == 10000.00

    def test_integer_amount(self):
        """Whole amount with zero cents: 100,00 -> 100.00."""
        parsed = extract_from_text("Netto: 100,00")
        assert parsed["net_amount"] == 100.00
|
|
|
|
|
|
class TestGermanDateFormat:
    """German dotted dates are normalised to ISO ``YYYY-MM-DD`` strings."""

    def test_dd_mm_yyyy(self):
        """Two-digit day and month: 15.11.2024 -> 2024-11-15."""
        parsed = extract_from_text("Rechnungsdatum: 15.11.2024")
        assert parsed["invoice_date"] == "2024-11-15"

    def test_d_m_yyyy(self):
        """Single-digit day and month: 4.2.2025 -> 2025-02-04."""
        parsed = extract_from_text("Rechnungsdatum: 4.2.2025")
        assert parsed["invoice_date"] == "2025-02-04"
|
|
|
|
|
|
class TestRealPDFExtraction:
    """End-to-end check: PDF fixture -> text -> extracted invoice fields."""

    def test_extract_from_en16931_sample(self):
        """EN16931_Einfach.pdf yields at least one populated field."""
        with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as pdf_file:
            raw_pdf = pdf_file.read()

        # Two-stage pipeline: raw text first, then field extraction on it.
        page_text = extract_text_from_pdf(raw_pdf)
        fields = extract_from_text(page_text)

        assert fields is not None

        # Exact values may vary with the PDF layout, so only require that
        # some real (non-confidence) field carries a value.
        populated = []
        for key, value in fields.items():
            if value is None or key.endswith("_confidence"):
                continue
            populated.append(key)

        assert len(populated) > 0
|