""" Unit tests for PDF text extraction and parsing. TDD approach: Tests written first, implementation follows. """ import pytest from src.pdf_parser import extract_text_from_pdf, extract_from_text class TestExtractTextFromPDF: """Test PDF text extraction using pypdf.""" def test_extract_text_from_sample_pdf(self): """Extract text from EN16931_Einfach.pdf sample.""" # Load the test PDF with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f: pdf_bytes = f.read() # Extract text text = extract_text_from_pdf(pdf_bytes) # Verify text was extracted assert text is not None assert len(text) > 0 # Verify key content is present assert "Lieferant GmbH" in text assert "Rechnung" in text def test_extract_text_from_empty_pdf(self): """Handle empty PDF gracefully.""" with open("tests/fixtures/EmptyPDFA1.pdf", "rb") as f: pdf_bytes = f.read() text = extract_text_from_pdf(pdf_bytes) # Should return empty string or minimal content assert isinstance(text, str) def test_extract_text_from_invalid_pdf(self): """Handle invalid PDF bytes gracefully.""" invalid_pdf = b"Not a valid PDF" # Should raise an appropriate error with pytest.raises(Exception): extract_text_from_pdf(invalid_pdf) class TestExtractFromText: """Test invoice field extraction from text using regex patterns.""" def test_extract_invoice_number_german(self): """Extract German invoice number format.""" text = "Rechnungs-Nr: RE-2025-001234" result = extract_from_text(text) assert "invoice_number" in result assert result["invoice_number"] == "RE-2025-001234" assert "invoice_number_confidence" in result assert result["invoice_number_confidence"] > 0.8 def test_extract_invoice_number_english(self): """Extract English invoice number format.""" text = "Invoice No: INV-2025-001234" result = extract_from_text(text) assert "invoice_number" in result assert result["invoice_number"] == "INV-2025-001234" def test_extract_invoice_number_beleg(self): """Extract Beleg-Nr format.""" text = "Beleg-Nr: 471102" result = extract_from_text(text) assert "invoice_number" in result assert result["invoice_number"] == "471102" def test_extract_invoice_date_german(self): """Extract German date format and convert to ISO.""" text = "Rechnungsdatum: 04.02.2025" result = extract_from_text(text) assert "invoice_date" in result assert result["invoice_date"] == "2025-02-04" def test_extract_invoice_date_iso(self): """Extract ISO date format.""" text = "Invoice Date: 2025-02-04" result = extract_from_text(text) assert "invoice_date" in result assert result["invoice_date"] == "2025-02-04" def test_extract_gross_amount_german(self): """Extract gross amount with German format.""" text = "Brutto: 1.234,56 EUR" result = extract_from_text(text) assert "gross_amount" in result assert result["gross_amount"] == 1234.56 assert "gross_amount_confidence" in result def test_extract_gross_amount_variations(self): """Test various gross amount labels.""" variations = [ ("Brutto: 118,88", 118.88), ("Gesamtbetrag: 118,88 EUR", 118.88), ("Total: 118.88", 118.88), ("Endbetrag: 529,87", 529.87), ("Summe: 100,00", 100.00), ] for text, expected in variations: result = extract_from_text(text) assert "gross_amount" in result assert result["gross_amount"] == expected def test_extract_net_amount(self): """Extract net amount.""" text = "Netto: 100,00 EUR" result = extract_from_text(text) assert "net_amount" in result assert result["net_amount"] == 100.00 assert "net_amount_confidence" in result def test_extract_net_amount_rechnungsbetrag(self): """Extract net amount with alternative label.""" text = "Rechnungsbetrag: 473,00" result = extract_from_text(text) assert "net_amount" in result assert result["net_amount"] == 473.00 def test_extract_vat_amount(self): """Extract VAT amount.""" text = "MwSt: 18,88 EUR" result = extract_from_text(text) assert "vat_amount" in result assert result["vat_amount"] == 18.88 assert "vat_amount_confidence" in result def test_extract_vat_amount_variations(self): """Test various VAT amount labels.""" variations = [ ("MwSt: 56,87", 56.87), ("USt: 18,88 EUR", 18.88), ("Steuer: 19,00", 19.00), ] for text, expected in variations: result = extract_from_text(text) assert "vat_amount" in result assert result["vat_amount"] == expected def test_extract_supplier_name(self): """Extract supplier name.""" text = "Lieferant: Lieferant GmbH" result = extract_from_text(text) assert "supplier_name" in result assert result["supplier_name"] == "Lieferant GmbH" def test_extract_supplier_name_verkaeufer(self): """Extract supplier with Verkäufer label.""" text = "Verkäufer: ACME Corporation Inc." result = extract_from_text(text) assert "supplier_name" in result assert result["supplier_name"] == "ACME Corporation Inc." def test_extract_all_fields_comprehensive(self): """Extract all fields from realistic invoice text.""" text = """ Rechnungs-Nr: RE-2025-001234 Rechnungsdatum: 04.02.2025 Lieferant: Lieferant GmbH Netto: 100,00 EUR MwSt: 18,88 EUR Brutto: 118,88 EUR """ result = extract_from_text(text) assert result["invoice_number"] == "RE-2025-001234" assert result["invoice_date"] == "2025-02-04" assert result["supplier_name"] == "Lieferant GmbH" assert result["net_amount"] == 100.00 assert result["vat_amount"] == 18.88 assert result["gross_amount"] == 118.88 def test_confidence_scores_in_range(self): """Verify all confidence scores are in 0.0-1.0 range.""" text = """ Rechnungs-Nr: RE-2025-001234 Rechnungsdatum: 04.02.2025 Lieferant: Lieferant GmbH Netto: 100,00 EUR MwSt: 18,88 EUR Brutto: 118,88 EUR """ result = extract_from_text(text) confidence_fields = [k for k in result.keys() if k.endswith("_confidence")] for field in confidence_fields: assert isinstance(result[field], (int, float)) assert 0.0 <= result[field] <= 1.0 def test_empty_text(self): """Handle empty input text gracefully.""" result = extract_from_text("") # Should return empty dict or dict with None values assert isinstance(result, dict) def test_no_matches(self): """Handle text with no matches.""" text = "This is just random text with no invoice data." result = extract_from_text(text) assert isinstance(result, dict) # Values should be None or missing class TestGermanNumberFormat: """Test German number format conversion.""" def test_simple_decimal(self): """Convert simple German decimal: 123,45""" text = "Brutto: 123,45" result = extract_from_text(text) assert result["gross_amount"] == 123.45 def test_thousands_separator(self): """Convert with thousands: 1.234,56""" text = "Brutto: 1.234,56" result = extract_from_text(text) assert result["gross_amount"] == 1234.56 def test_large_amount(self): """Convert large amount: 10.000,00""" text = "Brutto: 10.000,00" result = extract_from_text(text) assert result["gross_amount"] == 10000.00 def test_integer_amount(self): """Convert integer: 100,00""" text = "Netto: 100,00" result = extract_from_text(text) assert result["net_amount"] == 100.00 class TestGermanDateFormat: """Test German date format conversion.""" def test_dd_mm_yyyy(self): """Convert DD.MM.YYYY to ISO format.""" text = "Rechnungsdatum: 15.11.2024" result = extract_from_text(text) assert result["invoice_date"] == "2024-11-15" def test_d_m_yyyy(self): """Convert D.M.YYYY (single digits) to ISO format.""" text = "Rechnungsdatum: 4.2.2025" result = extract_from_text(text) assert result["invoice_date"] == "2025-02-04" class TestRealPDFExtraction: """Test extraction from actual PDF fixtures.""" def test_extract_from_en16931_sample(self): """Extract fields from EN16931_Einfach.pdf.""" with open("tests/fixtures/EN16931_Einfach.pdf", "rb") as f: pdf_bytes = f.read() # Extract text text = extract_text_from_pdf(pdf_bytes) # Extract fields result = extract_from_text(text) # Verify key fields were found assert result is not None # Check if at least some fields were extracted # (exact values may vary based on PDF layout) extracted_fields = [ k for k, v in result.items() if v is not None and not k.endswith("_confidence") ] assert len(extracted_fields) > 0