fix(extractor): detect xrechnung XML attachments

2026-05-21 14:45:20 +02:00
parent 18a902bcc9
commit d497a4cb5b
5 changed files with 159 additions and 5 deletions
@@ -6,7 +6,6 @@ Supports ZUGFeRD 2.x profiles: MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED.

 import io
 import time
-from typing import Any

 from facturx import get_flavor, get_level, get_xml_from_pdf
 from lxml import etree
@@ -402,6 +401,84 @@ def get_pdf_page_count(pdf_bytes: bytes) -> int:
        return 0


+def _find_xml_attachment_fallback(pdf_bytes: bytes) -> tuple[str | None, bytes | None]:
+    """Find an embedded XML attachment by MIME type or filename extension.
+
+    Fallback for XRechnung and other XML-based invoices whose attachment has a
+    non-standard filename (e.g. 'xrechnung.xml' instead of 'factur-x.xml')
+    that the factur-x library does not recognize.
+
+    Selection order:
+      1. The first non-empty attachment whose declared subtype/media type
+         contains "xml" and whose content is well-formed XML.
+      2. Otherwise, the first non-empty attachment whose name ends with
+         ".xml" (case-insensitive) and whose content is well-formed XML.
+
+    Args:
+        pdf_bytes: Raw PDF file content.
+
+    Returns:
+        Tuple of (filename, xml_bytes) or (None, None) if not found.
+
+    Raises:
+        ExtractionError: If constructing the PDF reader fails with an
+            exception that is not a pypdf read error.
+    """
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    try:
+        reader = PdfReader(io.BytesIO(pdf_bytes))
+    except (PdfReadError, PyPdfError) as e:
+        logger.debug(
+            "Failed to read PDF for XML attachment fallback search: %s",
+            str(e),
+        )
+        return (None, None)
+    except Exception as e:
+        raise ExtractionError(
+            error_code="fallback_pdf_read_error",
+            message="Unexpected error reading PDF for XML attachment fallback",
+            details=str(e),
+        ) from e
+
+    # Attachment enumeration and stream decoding happen after the reader is
+    # constructed, so a malformed embedded file can still raise pypdf errors
+    # here. Collect (name, lowercased subtype, content) once so both passes
+    # below work on plain data and each stream is decoded a single time.
+    # Attachments gathered before a failure are still considered (best effort).
+    candidates: list[tuple[str, str, bytes]] = []
+    try:
+        for att in reader.attachment_list:
+            content = att.content
+            if not content:
+                continue
+            subtype = getattr(att, "subtype", None) or getattr(att, "mediatype", None)
+            candidates.append((att.name, str(subtype or "").lower(), content))
+    except (PdfReadError, PyPdfError) as e:
+        logger.debug(
+            "Failed to enumerate PDF attachments for XML fallback search: %s",
+            str(e),
+        )
+
+    # Attachment bytes come from an untrusted PDF; this parse is only a
+    # well-formedness check, so entity resolution and network access are off.
+    parser = etree.XMLParser(resolve_entities=False, no_network=True)
+
+    def is_well_formed(content: bytes) -> bool:
+        """Return True if *content* parses as XML."""
+        try:
+            etree.fromstring(content, parser)
+        except etree.XMLSyntaxError:
+            return False
+        return True
+
+    # First pass: declared MIME type only (e.g. application/xml, text/xml).
+    # The filename is deliberately not consulted here; it is the second
+    # pass's criterion, so a MIME match always takes precedence.
+    for name, subtype, content in candidates:
+        if "xml" not in subtype:
+            continue
+        if is_well_formed(content):
+            logger.debug("Found XML attachment via MIME fallback: %s", name)
+            return (name, content)
+        logger.debug(
+            "Attachment '%s' has XML MIME but invalid XML content",
+            name,
+        )
+
+    # Second pass: filename extension, for attachments with a missing or
+    # generic MIME type.
+    for name, _subtype, content in candidates:
+        if not name.lower().endswith(".xml"):
+            continue
+        if is_well_formed(content):
+            logger.debug("Found XML attachment via extension fallback: %s", name)
+            return (name, content)
+        logger.debug(
+            "Attachment '%s' has .xml extension but invalid XML content",
+            name,
+        )
+
+    return (None, None)
+
+
 def _profile_from_urn(urn: str) -> str:
    """Extract a short profile name from a Factur-X/ZUGFeRD URN.

@@ -440,6 +517,9 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
            details=f"Size: {len(pdf_bytes)} bytes",
        )

+    xml_filename = None
+    xml_bytes = None
+
    try:
        xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False)
    except Exception as e:
@@ -449,16 +529,21 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
                error_code="password_protected_pdf",
                message="PDF is password protected",
                details=str(e),
-            )
+            ) from e
        if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg:
            raise ExtractionError(
                error_code="invalid_pdf", message="Invalid PDF file", details=str(e)
-            )
+            ) from e
        raise ExtractionError(
            error_code="corrupt_pdf",
            message="Failed to extract XML from PDF",
            details=str(e),
-        )
+        ) from e
+
+    # Fallback: If factur-x library didn't find XML, check attachments by MIME type
+    # This handles XRechnung and other XML attachments with non-standard filenames
+    if not xml_bytes:
+        xml_filename, xml_bytes = _find_xml_attachment_fallback(pdf_bytes)

    if not xml_bytes:
        pdf_text = extract_text_from_pdf(pdf_bytes)
@@ -481,9 +566,10 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
    try:
        level = get_level(xml_root, flavor)
    except ValueError:
+        # GuidelineSpecifiedDocumentContextParameter uses ram: namespace
        urn_nodes = xml_root.xpath(
            "//rsm:ExchangedDocumentContext/"
-            "rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
+            "ram:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
            namespaces=NAMESPACES,
        )
        urn = urn_nodes[0] if urn_nodes else ""