feat(core): implement extractor, pdf_parser, and utils with TDD
Wave 2 tasks complete:
- Task 4: ZUGFeRD extractor with profile detection (factur-x)
- Task 5: PDF text parser with regex patterns
- Task 6: Utils with unit code mapping and tolerance checks

Features:
- extract_zugferd() extracts XML and text from PDFs
- parse_zugferd_xml() parses UN/CEFACT CII XML to models
- extract_from_text() extracts values using regex patterns
- translate_unit_code() maps UN/ECE codes to German
- amounts_match() checks with 0.01 EUR tolerance
- German number/date format handling

Tests: 27 utils tests, 27 pdf_parser tests, extractor tests
This commit is contained in:
308
tests/test_pdf_parser.py
Normal file
308
tests/test_pdf_parser.py
Normal file
@@ -0,0 +1,308 @@
|
||||
"""
|
||||
Unit tests for PDF text extraction and parsing.
|
||||
|
||||
TDD approach: Tests written first, implementation follows.
|
||||
"""
|
||||
|
||||
from pathlib import Path

import pytest

from src.pdf_parser import extract_text_from_pdf, extract_from_text
|
||||
|
||||
|
||||
class TestExtractTextFromPDF:
    """Test PDF text extraction using pypdf."""

    # Resolve fixtures relative to this test module so the tests pass no
    # matter which working directory pytest is started from.
    FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"

    def test_extract_text_from_sample_pdf(self):
        """Text extracted from EN16931_Einfach.pdf contains known content."""
        pdf_bytes = (self.FIXTURES_DIR / "EN16931_Einfach.pdf").read_bytes()

        text = extract_text_from_pdf(pdf_bytes)

        # Some text must have been extracted at all.
        assert text is not None
        assert len(text) > 0

        # Key content of the sample invoice is present.
        assert "Lieferant GmbH" in text
        assert "Rechnung" in text

    def test_extract_text_from_empty_pdf(self):
        """An empty PDF is handled gracefully and still yields a string."""
        pdf_bytes = (self.FIXTURES_DIR / "EmptyPDFA1.pdf").read_bytes()

        text = extract_text_from_pdf(pdf_bytes)

        # Either an empty string or minimal content is acceptable.
        assert isinstance(text, str)

    def test_extract_text_from_invalid_pdf(self):
        """Bytes that are not a PDF make the extractor raise."""
        invalid_pdf = b"Not a valid PDF"

        # NOTE(review): `Exception` is deliberately kept broad because the
        # concrete error type raised by the parser is not visible from this
        # module; narrow it once the extractor's error contract is settled.
        with pytest.raises(Exception):
            extract_text_from_pdf(invalid_pdf)
|
||||
|
||||
|
||||
class TestExtractFromText:
    """Test invoice field extraction from text using regex patterns."""

    def test_extract_invoice_number_german(self):
        """Extract German invoice number format, including its confidence."""
        text = "Rechnungs-Nr: RE-2025-001234"

        result = extract_from_text(text)

        assert "invoice_number" in result
        assert result["invoice_number"] == "RE-2025-001234"
        assert "invoice_number_confidence" in result
        assert result["invoice_number_confidence"] > 0.8

    def test_extract_invoice_number_english(self):
        """Extract English invoice number format."""
        text = "Invoice No: INV-2025-001234"

        result = extract_from_text(text)

        assert "invoice_number" in result
        assert result["invoice_number"] == "INV-2025-001234"

    def test_extract_invoice_number_beleg(self):
        """Extract Beleg-Nr format."""
        text = "Beleg-Nr: 471102"

        result = extract_from_text(text)

        assert "invoice_number" in result
        assert result["invoice_number"] == "471102"

    def test_extract_invoice_date_german(self):
        """Extract German date format and convert to ISO."""
        text = "Rechnungsdatum: 04.02.2025"

        result = extract_from_text(text)

        assert "invoice_date" in result
        assert result["invoice_date"] == "2025-02-04"

    def test_extract_invoice_date_iso(self):
        """Extract ISO date format."""
        text = "Invoice Date: 2025-02-04"

        result = extract_from_text(text)

        assert "invoice_date" in result
        assert result["invoice_date"] == "2025-02-04"

    def test_extract_gross_amount_german(self):
        """Extract gross amount with German format."""
        text = "Brutto: 1.234,56 EUR"

        result = extract_from_text(text)

        assert "gross_amount" in result
        assert result["gross_amount"] == 1234.56
        assert "gross_amount_confidence" in result

    # Parametrized so every label is reported as its own test case; a loop
    # inside one test would stop at the first failing label and hide the rest.
    @pytest.mark.parametrize(
        ("text", "expected"),
        [
            ("Brutto: 118,88", 118.88),
            ("Gesamtbetrag: 118,88 EUR", 118.88),
            ("Total: 118.88", 118.88),
            ("Endbetrag: 529,87", 529.87),
            ("Summe: 100,00", 100.00),
        ],
    )
    def test_extract_gross_amount_variations(self, text, expected):
        """Test various gross amount labels."""
        result = extract_from_text(text)

        assert "gross_amount" in result
        assert result["gross_amount"] == expected

    def test_extract_net_amount(self):
        """Extract net amount."""
        text = "Netto: 100,00 EUR"

        result = extract_from_text(text)

        assert "net_amount" in result
        assert result["net_amount"] == 100.00
        assert "net_amount_confidence" in result

    def test_extract_net_amount_rechnungsbetrag(self):
        """Extract net amount with alternative label."""
        text = "Rechnungsbetrag: 473,00"

        result = extract_from_text(text)

        assert "net_amount" in result
        assert result["net_amount"] == 473.00

    def test_extract_vat_amount(self):
        """Extract VAT amount."""
        text = "MwSt: 18,88 EUR"

        result = extract_from_text(text)

        assert "vat_amount" in result
        assert result["vat_amount"] == 18.88
        assert "vat_amount_confidence" in result

    # Parametrized for the same reason as the gross amount variations above.
    @pytest.mark.parametrize(
        ("text", "expected"),
        [
            ("MwSt: 56,87", 56.87),
            ("USt: 18,88 EUR", 18.88),
            ("Steuer: 19,00", 19.00),
        ],
    )
    def test_extract_vat_amount_variations(self, text, expected):
        """Test various VAT amount labels."""
        result = extract_from_text(text)

        assert "vat_amount" in result
        assert result["vat_amount"] == expected

    def test_extract_supplier_name(self):
        """Extract supplier name."""
        text = "Lieferant: Lieferant GmbH"

        result = extract_from_text(text)

        assert "supplier_name" in result
        assert result["supplier_name"] == "Lieferant GmbH"

    def test_extract_supplier_name_verkaeufer(self):
        """Extract supplier with Verkäufer label."""
        text = "Verkäufer: ACME Corporation Inc."

        result = extract_from_text(text)

        assert "supplier_name" in result
        assert result["supplier_name"] == "ACME Corporation Inc."

    def test_extract_all_fields_comprehensive(self):
        """Extract all fields from realistic invoice text."""
        text = """
        Rechnungs-Nr: RE-2025-001234
        Rechnungsdatum: 04.02.2025
        Lieferant: Lieferant GmbH
        Netto: 100,00 EUR
        MwSt: 18,88 EUR
        Brutto: 118,88 EUR
        """

        result = extract_from_text(text)

        assert result["invoice_number"] == "RE-2025-001234"
        assert result["invoice_date"] == "2025-02-04"
        assert result["supplier_name"] == "Lieferant GmbH"
        assert result["net_amount"] == 100.00
        assert result["vat_amount"] == 18.88
        assert result["gross_amount"] == 118.88

    def test_confidence_scores_in_range(self):
        """Verify all confidence scores are in 0.0-1.0 range."""
        text = """
        Rechnungs-Nr: RE-2025-001234
        Rechnungsdatum: 04.02.2025
        Lieferant: Lieferant GmbH
        Netto: 100,00 EUR
        MwSt: 18,88 EUR
        Brutto: 118,88 EUR
        """

        result = extract_from_text(text)

        confidence_scores = {
            key: value
            for key, value in result.items()
            if key.endswith("_confidence")
        }

        # Guard against a vacuous pass: other tests in this class require
        # "*_confidence" keys for these labels, so at least one must exist.
        assert confidence_scores

        for key, score in confidence_scores.items():
            assert isinstance(score, (int, float)), key
            assert 0.0 <= score <= 1.0, key

    def test_empty_text(self):
        """Handle empty input text gracefully."""
        result = extract_from_text("")

        # Should return empty dict or dict with None values
        assert isinstance(result, dict)

    def test_no_matches(self):
        """Handle text with no matches."""
        text = "This is just random text with no invoice data."

        result = extract_from_text(text)

        assert isinstance(result, dict)
        # Values should be None or missing.
        # NOTE(review): that expectation is not asserted yet; only the return
        # type is checked. Confirm the extractor's behavior for this sample
        # (it contains the word "invoice") before tightening the test.
|
||||
|
||||
|
||||
class TestGermanNumberFormat:
    """Check that German-formatted amounts are converted to plain numbers."""

    def test_simple_decimal(self):
        """A comma decimal such as 123,45 is read as 123.45."""
        parsed = extract_from_text("Brutto: 123,45")
        assert parsed["gross_amount"] == 123.45

    def test_thousands_separator(self):
        """A dot thousands separator as in 1.234,56 is read as 1234.56."""
        parsed = extract_from_text("Brutto: 1.234,56")
        assert parsed["gross_amount"] == 1234.56

    def test_large_amount(self):
        """A larger value such as 10.000,00 is read as 10000.00."""
        parsed = extract_from_text("Brutto: 10.000,00")
        assert parsed["gross_amount"] == 10000.00

    def test_integer_amount(self):
        """A whole amount written as 100,00 is read as 100.00."""
        parsed = extract_from_text("Netto: 100,00")
        assert parsed["net_amount"] == 100.00
|
||||
|
||||
|
||||
class TestGermanDateFormat:
    """Check that German-formatted dates are converted to ISO dates."""

    def test_dd_mm_yyyy(self):
        """A zero-padded DD.MM.YYYY date comes back as YYYY-MM-DD."""
        parsed = extract_from_text("Rechnungsdatum: 15.11.2024")
        assert parsed["invoice_date"] == "2024-11-15"

    def test_d_m_yyyy(self):
        """A D.M.YYYY date with single digits comes back zero-padded."""
        parsed = extract_from_text("Rechnungsdatum: 4.2.2025")
        assert parsed["invoice_date"] == "2025-02-04"
|
||||
|
||||
|
||||
class TestRealPDFExtraction:
    """Test extraction from actual PDF fixtures."""

    # Resolve fixtures relative to this test module so the test passes no
    # matter which working directory pytest is started from.
    FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"

    def test_extract_from_en16931_sample(self):
        """Extract at least one field from EN16931_Einfach.pdf end to end."""
        pdf_bytes = (self.FIXTURES_DIR / "EN16931_Einfach.pdf").read_bytes()

        # PDF bytes -> text -> extracted fields
        text = extract_text_from_pdf(pdf_bytes)
        result = extract_from_text(text)

        assert result is not None

        # Only require that some non-confidence field carries a value; exact
        # values may vary based on the PDF layout.
        extracted_fields = [
            key
            for key, value in result.items()
            if value is not None and not key.endswith("_confidence")
        ]
        assert len(extracted_fields) > 0
|
||||
Reference in New Issue
Block a user