fix(extractor): detect xrechnung XML attachments
This commit is contained in:
@@ -7,6 +7,8 @@ then implementation makes them pass (GREEN phase).
|
||||
import pytest
|
||||
import base64
|
||||
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
|
||||
class TestExtractionError:
|
||||
"""Test ExtractionError exception class."""
|
||||
@@ -299,3 +301,60 @@ class TestZUGFeRDProfileVariations:
|
||||
# Profile should be detected (BASIC, BASICWL, etc.)
|
||||
assert result.zugferd_profil is not None
|
||||
assert result.xml_data is not None
|
||||
|
||||
|
||||
class TestXRechnungExtraction:
    """Test extraction from XRechnung PDFs with non-standard filenames."""

    def test_xrechnung_by_xml_extension_fallback(self):
        """Test XRechnung with 'xrechnung.xml' filename is detected via fallback.

        This test verifies the fallback mechanism when the factur-x library
        doesn't recognize non-standard filenames like 'xrechnung.xml'.
        We create a synthetic PDF with a non-standard attachment name to
        exercise the fallback code path.

        Steps:
            1. Take the embedded payload of the first attachment in the
               ``validXRechnung.pdf`` fixture.
            2. Re-embed it into a fresh one-page PDF as 'xrechnung.xml'.
            3. Check that factur-x alone reports no XML for that PDF.
            4. Check that ``_find_xml_attachment_fallback`` and
               ``extract_zugferd`` both pick the attachment up.
        """
        from io import BytesIO

        from facturx import get_xml_from_pdf

        from src.extractor import extract_zugferd, _find_xml_attachment_fallback

        # Load valid XML from existing test fixture. The attachment content
        # is read while the file is still open, since the reader works on
        # the open file handle.
        with open("tests/fixtures/validXRechnung.pdf", "rb") as f:
            orig_reader = PdfReader(f)
            first_attachment = next(iter(orig_reader.attachment_list), None)
            xml_content = (
                None if first_attachment is None else first_attachment.content
            )

        if xml_content is None:
            pytest.fail("Test fixture has no XML attachment")

        # Create a new PDF with a non-standard attachment name
        pdf_writer = PdfWriter()
        pdf_writer.add_blank_page(width=72, height=72)
        pdf_writer.add_attachment("xrechnung.xml", xml_content)

        output = BytesIO()
        pdf_writer.write(output)
        pdf_bytes = output.getvalue()

        # Verify factur-x does NOT recognize it (returns False/None).
        # Assert on falsiness rather than identity with False so the
        # precondition holds for either "not found" sentinel.
        facturx_result = get_xml_from_pdf(pdf_bytes, check_xsd=False)
        assert not facturx_result[0], "factur-x should not recognize xrechnung.xml"

        # Verify our fallback function DOES find it
        fallback_result = _find_xml_attachment_fallback(pdf_bytes)
        assert fallback_result[0] == "xrechnung.xml", (
            "Fallback should find xrechnung.xml"
        )
        assert fallback_result[1] is not None, "Fallback should return XML content"

        # Verify full extraction works via fallback
        result = extract_zugferd(pdf_bytes)

        assert result.is_zugferd is True
        assert result.xml_raw is not None
        assert result.xml_data is not None
        assert result.xml_data.invoice_number is not None
        assert result.extraction_meta.xml_attachment_name == "xrechnung.xml"
|
||||
|
||||
Reference in New Issue
Block a user