fix(extractor): detect xrechnung XML attachments

This commit is contained in:
m3ta-chiron
2026-05-21 14:45:20 +02:00
parent 18a902bcc9
commit d497a4cb5b
5 changed files with 159 additions and 5 deletions

View File

@@ -7,6 +7,8 @@ then implementation makes them pass (GREEN phase).
import pytest
import base64
from pypdf import PdfReader, PdfWriter
class TestExtractionError:
"""Test ExtractionError exception class."""
@@ -299,3 +301,60 @@ class TestZUGFeRDProfileVariations:
# Profile should be detected (BASIC, BASICWL, etc.)
assert result.zugferd_profil is not None
assert result.xml_data is not None
class TestXRechnungExtraction:
    """Extraction tests for XRechnung PDFs whose XML attachment has a non-standard name."""

    def test_xrechnung_by_xml_extension_fallback(self):
        """An attachment named 'xrechnung.xml' is picked up by the fallback lookup.

        The XML payload of an existing fixture PDF is re-embedded into a
        freshly built one-page PDF under the name 'xrechnung.xml'. The test
        then checks three things in order:

        1. factur-x itself does not report an XML file for that PDF,
        2. ``_find_xml_attachment_fallback`` returns the attachment name
           and a non-None payload,
        3. ``extract_zugferd`` produces a populated result and records the
           attachment name in its extraction metadata.
        """
        from io import BytesIO
        from src.extractor import extract_zugferd, _find_xml_attachment_fallback

        # Pull the payload of the first attachment out of the fixture PDF.
        # The lookup happens while the file handle is still open because the
        # reader is constructed from that handle.
        with open("tests/fixtures/validXRechnung.pdf", "rb") as fixture_file:
            fixture_reader = PdfReader(fixture_file)
            embedded_xml = next(
                (attachment.content for attachment in fixture_reader.attachment_list),
                None,
            )

        if embedded_xml is None:
            pytest.fail("Test fixture has no XML attachment")

        # Assemble a minimal PDF that carries the payload under the
        # non-standard attachment name.
        synthetic_pdf = PdfWriter()
        synthetic_pdf.add_blank_page(width=72, height=72)
        synthetic_pdf.add_attachment("xrechnung.xml", embedded_xml)
        with BytesIO() as buffer:
            synthetic_pdf.write(buffer)
            pdf_bytes = buffer.getvalue()

        # Precondition: factur-x must NOT report an XML file for this PDF.
        from facturx import get_xml_from_pdf

        facturx_result = get_xml_from_pdf(pdf_bytes, check_xsd=False)
        assert facturx_result[0] is False, "factur-x should not recognize xrechnung.xml"

        # The fallback helper, in contrast, must locate the attachment.
        fallback_result = _find_xml_attachment_fallback(pdf_bytes)
        assert fallback_result[0] == "xrechnung.xml", (
            "Fallback should find xrechnung.xml"
        )
        assert fallback_result[1] is not None, "Fallback should return XML content"

        # End to end: the public entry point succeeds on the same bytes.
        extraction = extract_zugferd(pdf_bytes)
        assert extraction.is_zugferd is True
        assert extraction.xml_raw is not None
        assert extraction.xml_data is not None
        assert extraction.xml_data.invoice_number is not None
        assert extraction.extraction_meta.xml_attachment_name == "xrechnung.xml"