fix(extractor): detect xrechnung XML attachments

2026-05-21 14:45:20 +02:00
parent 18a902bcc9
commit d497a4cb5b
5 changed files with 159 additions and 5 deletions
@@ -0,0 +1,5 @@
 #!/usr/bin/env bash
 # Activate the devshell from the Nix flake
 # This loads all tools and environment variables defined in flake.nix
 use flake
@@ -55,3 +55,6 @@ opencode.json
 .sidecar-start.sh
 .sidecar-base
 .td-root
 # pi
 .pi*
@@ -0,0 +1 @@
 /nix/store/5f1zpyn23pabv4j30g651cy2bfkpi7f6-source/rules
@@ -6,7 +6,6 @@ Supports ZUGFeRD 2.x profiles: MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED.
 import io
 import time
 from typing import Any
 from facturx import get_flavor, get_level, get_xml_from_pdf
 from lxml import etree
@@ -402,6 +401,84 @@ def get_pdf_page_count(pdf_bytes: bytes) -> int:
        return 0
 def _find_xml_attachment_fallback(pdf_bytes: bytes) -> tuple[str | None, bytes | None]:
    """Find an embedded XML attachment by MIME type or filename (fallback).

    This handles XRechnung and other XML-based invoices with non-standard
    filenames (e.g., 'xrechnung.xml' instead of 'factur-x.xml') that the
    factur-x library does not recognize.

    Selection priority is:

    1. The first attachment that declares a subtype/media type and whose
       subtype or filename contains "xml", provided its content parses as XML.
    2. Otherwise, the first attachment whose filename ends in ".xml" and
       whose content parses as XML.

    Attachments are only checked for well-formedness here; the parsed tree
    is discarded and the raw bytes are returned to the caller.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Tuple of (filename, xml_bytes) or (None, None) if not found.

    Raises:
        ExtractionError: If opening the PDF fails with an error that is not
            a pypdf read error (error code "fallback_pdf_read_error").
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
    except (PdfReadError, PyPdfError) as e:
        logger.debug(
            "Failed to read PDF for XML attachment fallback search: %s",
            str(e),
        )
        return (None, None)
    except Exception as e:
        raise ExtractionError(
            error_code="fallback_pdf_read_error",
            message="Unexpected error reading PDF for XML attachment fallback",
            details=str(e),
        ) from e

    def _is_well_formed_xml(content: bytes) -> bool:
        """Return True if *content* parses as XML with a hardened parser."""
        # Attachments come from untrusted PDFs: never expand entities or
        # touch the network while merely probing for well-formedness.
        parser = etree.XMLParser(resolve_entities=False, no_network=True)
        try:
            etree.fromstring(content, parser)
        except etree.XMLSyntaxError:
            return False
        return True

    # Single pass over the attachments. A MIME-type match wins immediately;
    # the first valid ".xml" filename match is remembered and only used when
    # no MIME-type match exists.
    extension_match: tuple[str, bytes] | None = None
    for att in reader.attachment_list:
        name = att.name
        try:
            # Read the content once and reuse it for every check below.
            content = att.content
        except PyPdfError as e:
            # Best-effort search: one unreadable attachment must not hide
            # a valid one further down the list.
            logger.debug("Skipping unreadable attachment '%s': %s", name, e)
            continue
        if not content:
            continue

        lowered_name = name.lower()
        subtype = getattr(att, "subtype", None) or getattr(att, "mediatype", None)
        if subtype and ("xml" in str(subtype).lower() or "xml" in lowered_name):
            if _is_well_formed_xml(content):
                logger.debug(
                    "Found XML attachment via MIME fallback: %s",
                    name,
                )
                return (name, content)
            logger.debug(
                "Attachment '%s' has XML MIME but invalid XML content",
                name,
            )
            # Invalid content cannot qualify via the extension rule either.
            continue

        if extension_match is None and lowered_name.endswith(".xml"):
            if _is_well_formed_xml(content):
                extension_match = (name, content)
            else:
                logger.debug(
                    "Attachment '%s' has .xml extension but invalid XML content",
                    name,
                )

    if extension_match is not None:
        logger.debug(
            "Found XML attachment via extension fallback: %s",
            extension_match[0],
        )
        return extension_match
    return (None, None)
 def _profile_from_urn(urn: str) -> str:
    """Extract a short profile name from a Factur-X/ZUGFeRD URN.
@@ -440,6 +517,9 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
            details=f"Size: {len(pdf_bytes)} bytes",
        )
    xml_filename = None
    xml_bytes = None
    try:
        xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False)
    except Exception as e:
@@ -449,16 +529,21 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
                error_code="password_protected_pdf",
                message="PDF is password protected",
                details=str(e),
-            )
+            ) from e
        if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg:
            raise ExtractionError(
                error_code="invalid_pdf", message="Invalid PDF file", details=str(e)
-            )
+            ) from e
        raise ExtractionError(
            error_code="corrupt_pdf",
            message="Failed to extract XML from PDF",
            details=str(e),
-        )
+        ) from e
    # Fallback: if the factur-x library didn't find XML, look for an XML attachment
    # by MIME type or ".xml" filename extension. This handles XRechnung and other
    # XML attachments with non-standard filenames.
    if not xml_bytes:
        xml_filename, xml_bytes = _find_xml_attachment_fallback(pdf_bytes)
    if not xml_bytes:
        pdf_text = extract_text_from_pdf(pdf_bytes)
@@ -481,9 +566,10 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
    try:
        level = get_level(xml_root, flavor)
    except ValueError:
        # GuidelineSpecifiedDocumentContextParameter uses ram: namespace
        urn_nodes = xml_root.xpath(
            "//rsm:ExchangedDocumentContext/"
-            "rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
+            "ram:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
            namespaces=NAMESPACES,
        )
        urn = urn_nodes[0] if urn_nodes else ""
@@ -7,6 +7,8 @@ then implementation makes them pass (GREEN phase).
 import pytest
 import base64
 from pypdf import PdfReader, PdfWriter
 class TestExtractionError:
    """Test ExtractionError exception class."""
@@ -299,3 +301,60 @@ class TestZUGFeRDProfileVariations:
        # Profile should be detected (BASIC, BASICWL, etc.)
        assert result.zugferd_profil is not None
        assert result.xml_data is not None
 class TestXRechnungExtraction:
    """Extraction tests for XRechnung PDFs whose XML attachment has a non-standard name."""
    def test_xrechnung_by_xml_extension_fallback(self):
        """An attachment named 'xrechnung.xml' must be picked up by the fallback.

        The factur-x library does not recognize non-standard attachment
        names such as 'xrechnung.xml'. A synthetic PDF carrying the fixture's
        XML under that name is built so the fallback code path is exercised.
        """
        from io import BytesIO
        from facturx import get_xml_from_pdf
        from src.extractor import _find_xml_attachment_fallback, extract_zugferd

        # Borrow the first attachment of the existing fixture as invoice XML.
        with open("tests/fixtures/validXRechnung.pdf", "rb") as fixture:
            first_attachment = next(iter(PdfReader(fixture).attachment_list), None)
            xml_content = (
                first_attachment.content if first_attachment is not None else None
            )
        if xml_content is None:
            pytest.fail("Test fixture has no XML attachment")

        # Build a one-page PDF that embeds the XML under a non-standard name.
        writer = PdfWriter()
        writer.add_blank_page(width=72, height=72)
        writer.add_attachment("xrechnung.xml", xml_content)
        buffer = BytesIO()
        writer.write(buffer)
        pdf_bytes = buffer.getvalue()

        # factur-x itself must NOT recognize the attachment (filename is False).
        facturx_result = get_xml_from_pdf(pdf_bytes, check_xsd=False)
        assert facturx_result[0] is False, "factur-x should not recognize xrechnung.xml"

        # The fallback helper, in contrast, must locate it.
        found_name, found_xml = _find_xml_attachment_fallback(pdf_bytes)
        assert found_name == "xrechnung.xml", (
            "Fallback should find xrechnung.xml"
        )
        assert found_xml is not None, "Fallback should return XML content"

        # End to end: the full extraction succeeds through the fallback.
        extracted = extract_zugferd(pdf_bytes)
        assert extracted.is_zugferd is True
        assert extracted.xml_raw is not None
        assert extracted.xml_data is not None
        assert extracted.xml_data.invoice_number is not None
        assert extracted.extraction_meta.xml_attachment_name == "xrechnung.xml"
		`@@ -0,0 +1 @@`
							`/nix/store/5f1zpyn23pabv4j30g651cy2bfkpi7f6-source/rules`